├── .clang-format ├── .gitattributes ├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── Makefile ├── README.md ├── balance ├── Makefile ├── balance.cc ├── balance.lext └── build.sh ├── carmel ├── .project ├── CMakeLists.txt ├── ChangeLog ├── Doxyfile ├── LICENSE ├── Makefile ├── NOTES ├── README ├── ToDo ├── carmel ├── carmel-tutorial │ ├── carmel-training.pdf │ ├── cat.fsa │ ├── cat.fsa.trained │ ├── cat.fsa.trained.noe │ ├── cipher.data │ ├── cipher.data.noe │ ├── cipher.fst │ ├── cipher.fst.trained │ ├── cipher.gold │ ├── cipher.wfsa │ ├── cipher.wfsa.noe │ ├── cipher.wfsa.trained │ ├── cluster.data │ ├── cluster.data.noe │ ├── cluster.fsa │ ├── commands │ ├── commands.trace │ ├── delete.fst │ ├── delete.fst.trained │ ├── deltrans.data │ ├── deltrans.data.compact │ ├── epron-jpron.data │ ├── epron-jpron.fst │ ├── spellout.fst │ ├── spellout.fst.trained │ ├── tagging.data │ ├── tagging.data.noe │ ├── tagging.fsa │ ├── tagging.fsa.trained │ ├── tagging.fsa.trained.noe │ ├── tagging.fst │ ├── tagging.fst.trained │ ├── tagging.key │ ├── trans.fst │ └── trans.fst.trained ├── carmel-tutorial2.pdf ├── debug.sh ├── doc │ ├── FORMATS │ ├── carmel-tutorial.html │ ├── carmel-tutorial.ps │ └── carmel-tutorial_files │ │ ├── filelist.xml │ │ ├── image001.gif │ │ ├── image002.gif │ │ ├── image003.gif │ │ ├── image004.gif │ │ ├── image005.gif │ │ ├── image006.gif │ │ ├── image007.gif │ │ ├── image008.gif │ │ ├── image009.gif │ │ ├── image010.gif │ │ └── image011.gif ├── make-dictionary.pl ├── sample │ ├── chain.1 │ ├── chain.2 │ ├── chain.corpus │ ├── chain.mid │ ├── comments │ ├── decipher │ │ ├── README │ │ ├── cipher │ │ ├── cipher2 │ │ ├── cipherbad │ │ ├── correct │ │ ├── correct2 │ │ ├── errors.sh │ │ ├── fem.sh │ │ ├── plain.bi.wfsa │ │ ├── plain.tri.wfsa │ │ ├── subst.wfst │ │ └── to-fem.sh │ ├── do.graphviz.srilm.sh │ ├── egraph │ ├── emptyfsa │ ├── fsa1 │ ├── fsa10 │ ├── fsa11 │ ├── fsa2 │ ├── fsa3 │ ├── fsa4 │ ├── fsa5 │ ├── fsa6 │ ├── fsa7 │ ├── fsa8 │ ├── fsa9 │ ├── 
fst1 │ ├── fst2 │ ├── kevin_g │ ├── negative.cost.kbest │ ├── nested │ ├── space │ │ ├── letters.nosp │ │ ├── subst2.wfst │ │ ├── subst3.wfst │ │ └── words.wfsa │ ├── tag │ │ ├── README │ │ ├── channel.fst │ │ ├── cipher │ │ ├── correct │ │ └── source.fsa │ ├── test.sh │ ├── tiny.sri │ ├── tmp │ │ ├── a │ │ ├── a.2 │ │ ├── a.t │ │ ├── a1 │ │ ├── a2 │ │ ├── a3 │ │ ├── a3.t │ │ ├── aa │ │ ├── aaa │ │ ├── b1.wfsa │ │ ├── fsa1 │ │ ├── fsa2 │ │ ├── fsa3 │ │ ├── fsa4 │ │ ├── fsa4.1 │ │ ├── fsa5 │ │ ├── fsa6 │ │ ├── fsa7 │ │ ├── fst1 │ │ ├── fst1.1 │ │ ├── fst1.2 │ │ ├── fst1.3 │ │ ├── fst2 │ │ ├── t │ │ ├── t.1 │ │ ├── t1 │ │ ├── t2 │ │ ├── wfsa1 │ │ ├── wfsa2 │ │ ├── wfst1 │ │ ├── wfst2 │ │ └── wfst3 │ ├── tree-cascade │ │ ├── README │ │ ├── hidden.fsa │ │ ├── observed0.data │ │ ├── observed0.fst │ │ ├── observed1.data │ │ ├── observed1.fst │ │ ├── observed2.data │ │ ├── observed2.fst │ │ ├── s │ │ │ ├── hidden.fsa │ │ │ ├── observed0.data │ │ │ ├── observed0.fst │ │ │ ├── observed1.data │ │ │ ├── observed1.fst │ │ │ ├── observed2.data │ │ │ └── observed2.fst │ │ └── train.sh │ ├── wfsa.perplexity │ ├── wfsa1 │ ├── wfsa2 │ ├── wfsa3 │ ├── wfsa4 │ ├── wfst1 │ ├── wfst2 │ ├── wfst2.preprune │ ├── wfst3 │ ├── wfst3c │ └── wfstlog ├── src │ ├── Makefile │ ├── WARNING │ ├── cached_derivs.h │ ├── carmel.cc │ ├── cascade.h │ ├── compose.cc │ ├── compose.h │ ├── config.hpp │ ├── derivations.h │ ├── fst.cc │ ├── fst.h │ ├── gibbs.cc │ ├── gibbs.txt │ ├── models.h │ ├── sri2fsa.pl │ ├── state.h │ ├── tests │ │ ├── Makefile │ │ └── Tweight.cc │ ├── train.cc │ ├── train.h │ └── wfstio.cc ├── test │ ├── N.cascade.train.gen.sh │ ├── angela.knight.kbest.wfst │ ├── asciikana-katakana.transducer │ ├── bad.-a.1 │ ├── bad.-a.2 │ ├── compose-test.sh │ ├── determinize.usr.dict.sh │ ├── empty │ ├── epron-jpron.1.transducer │ ├── fsa7 │ ├── j-test-jap │ ├── jpron-asciikana.transducer │ ├── jpron.transducer │ ├── kbest.small.cycle │ ├── phillip.kbest │ ├── prune.test │ ├── runtests.sh │ ├── 
span.spell.corpus │ ├── span.spell.wfst │ ├── test.asciikana │ ├── test.compose.-a.sh │ ├── test.epron │ ├── test.final │ ├── test.jpron2 │ ├── test.kana │ ├── test.katakana │ ├── test.word │ ├── train.a │ ├── train.a.u │ ├── train.a.w │ ├── train.cascade.gen.sh │ ├── train.self.gen.sh │ ├── traintest.sh │ ├── vowel-separator.transducer │ ├── wfst2 │ ├── word-epron.names.55000wds.transducer │ ├── word.names.50000wds.transducer │ ├── y.data │ ├── y1.new │ ├── y2.new │ └── y4.new └── training-code.txt ├── catn0.cc ├── cipher ├── baseline.2.pl ├── carmel-quote-words ├── class-features ├── class-ngrams ├── class-word-fst ├── class-word-wfst ├── count-ngrams ├── decipher ├── decipher-classes ├── do-classes ├── encipher ├── epsilon-string-pairs ├── eval-classes ├── filter_docid ├── full-class-channel ├── soft-classes ├── split-words ├── sri2fsa.pl ├── summary-classes ├── text-to-classes ├── unigram-freq-bands ├── word-accuracy └── word-freq ├── clm ├── NOTES ├── clm-jan-09.pdf ├── e-parse-yield.pl ├── extract.clm.sh ├── phrasal-clm-events ├── rule_context.txt ├── shen08.pdf ├── stripEF.pl └── uniq_srilm.pl ├── forest-em ├── .gitignore ├── CMakeLists.txt ├── Makefile ├── README ├── forest-em-button.sh ├── forest-em-params.cpp ├── forest-em-params.hpp ├── forest-em.README ├── forest-em.cpp ├── forest-em.hpp ├── forest.hpp ├── forestviz.cpp └── sample │ ├── .gitignore │ ├── Makefile │ ├── best_forest │ ├── best_norm │ ├── best_weights │ ├── byid_rules │ ├── derivs │ ├── first10.deriv │ ├── first10.norm │ ├── first10.rules │ ├── first100.deriv │ ├── first100.norm │ ├── first100.rules │ ├── first1000.deriv │ ├── first1000.norm │ ├── first1000.rules │ ├── first10000.deriv │ ├── first10000.norm │ └── first10000.rules │ ├── forest │ ├── forests │ ├── forests.gz │ ├── ints │ ├── norm │ ├── norm_and_forests │ ├── raw_weight_array │ ├── rule_list │ ├── testderivs.sh │ └── tree.gz ├── gextract ├── 10k.a ├── 10k.e-parse ├── 10k.f ├── 10k.info ├── astronauts.a ├── astronauts.e-parse 
├── astronauts.f ├── castronauts.a ├── castronauts.a-gold ├── castronauts.e-parse ├── castronauts.f ├── check.e-parse.py ├── dendro.py ├── do.mono.sh ├── dumpx.py ├── e-parse.format.txt ├── etree.py ├── gextract.py ├── gflags.py ├── graehl.py ├── optfunc.py ├── optfunc │ ├── __init__.py │ └── optfunc.py ├── radu2ptb.pl ├── reviz.sh ├── subset-training.py ├── training.a ├── training.e-parse ├── training.f └── tree.py ├── graehl ├── graehl.mk └── shared │ ├── .gdbinit │ ├── .gitignore │ ├── 2hash.h │ ├── 2heap.h │ ├── ChangeLog │ ├── FixedBuffer.h │ ├── Lx_norm.hpp │ ├── Makefile │ ├── README │ ├── SGT.c │ ├── SGT.counts.txt │ ├── SGT.hpp │ ├── __gmsl │ ├── _template.hpp │ ├── abs_int.hpp │ ├── accumulate.hpp │ ├── adjustableheap.hpp │ ├── adl_print.hpp │ ├── adl_to_string.hpp │ ├── align.hpp │ ├── aligned_allocator.hpp │ ├── aligned_dynamic_array.hpp │ ├── alloc_new_delete.hpp │ ├── alloc_stack.hpp │ ├── any_all.hpp │ ├── any_callable.hpp │ ├── append.hpp │ ├── arc.h │ ├── array.hpp │ ├── array_stream.hpp │ ├── assertlvl.hpp │ ├── assign_traits.hpp │ ├── assoc_container.hpp │ ├── atoi_fast.hpp │ ├── auto_report.hpp │ ├── backtrace.hpp │ ├── band_matrix.hpp │ ├── barrier.hpp │ ├── base64.hpp │ ├── batched_append.hpp │ ├── best_tree_options.hpp │ ├── bit_arithmetic.hpp │ ├── bitarray.h │ ├── bitset.hpp │ ├── blocks.c │ ├── blocks.h │ ├── breakpoint.hpp │ ├── byref.hpp │ ├── changelog.hpp │ ├── char_is.hpp │ ├── char_map.hpp │ ├── char_predicate.hpp │ ├── char_transform.hpp │ ├── charbuf.hpp │ ├── checkpoint_istream.hpp │ ├── cmdline_main.hpp │ ├── command_line.hpp │ ├── commandline.cpp.template │ ├── config.h │ ├── configurable.hpp │ ├── configure.hpp │ ├── configure_by_prototype.hpp │ ├── configure_hadoop_pipes.hpp │ ├── configure_init.hpp │ ├── configure_is.hpp │ ├── configure_named_bits.hpp │ ├── configure_noop.hpp │ ├── configure_policy.hpp │ ├── configure_program_options.hpp │ ├── configure_traits.hpp │ ├── configure_validate.hpp │ ├── container.hpp │ ├── 
containers.hpp │ ├── cpp11.hpp │ ├── d_ary_heap.hpp │ ├── dbg_level.hpp │ ├── debuggable.hpp │ ├── debugger.mk │ ├── debugprint.hpp │ ├── default_pool_alloc.hpp │ ├── default_print_on.hpp │ ├── defaulted.hpp │ ├── delta_sum.hpp │ ├── delta_sum_remember.hpp │ ├── digamma.hpp │ ├── doubling_primes.hpp │ ├── dual_mempool.hpp │ ├── dummy.hpp │ ├── dynamic_array.hpp │ ├── dynamic_hash_cache.hpp │ ├── dynamic_sized.hpp │ ├── em.hpp │ ├── epsilon.hpp │ ├── escape3.hpp │ ├── exact_cast.hpp │ ├── example-cpp-with-boost-options.cpp │ ├── example.Makefile │ ├── example_value.hpp │ ├── farmhash.hpp │ ├── fast_lexical_cast.hpp │ ├── fileargs.cpp │ ├── fileargs.hpp │ ├── fileheader.hpp │ ├── filelines.hpp │ ├── filter_file_stream.hpp │ ├── find_string.hpp │ ├── fixed_array.hpp │ ├── fixed_pool.h │ ├── flag.hpp │ ├── force_link.hpp │ ├── format.hpp │ ├── from_strings.hpp │ ├── ftoa.hpp │ ├── ftoa_append.hpp │ ├── ftoa_ieee.hpp │ ├── ftos.hpp │ ├── funcs.hpp │ ├── function.hpp │ ├── function_macro.hpp │ ├── function_output_iterator.hpp │ ├── gen-base_construct.ipp │ ├── genio.h │ ├── gibbs.hpp │ ├── gibbs_opts.hpp │ ├── glibc_memcpy.hpp │ ├── glog.hpp │ ├── gmsl │ ├── good_alloc_size.hpp │ ├── graph.cc │ ├── graph.h │ ├── graph.hpp │ ├── graphviz.hpp │ ├── gzstream.cpp │ ├── gzstream.h │ ├── gzstream.hpp │ ├── has_print.hpp │ ├── hash.hpp │ ├── hash_cache.hpp │ ├── hash_city.hpp │ ├── hash_functions.hpp │ ├── hash_jenkins.hpp │ ├── hash_murmur.hpp │ ├── hashbench.cpp │ ├── hashed_value.hpp │ ├── hashtable_fwd.hpp │ ├── have_64_bits.hpp │ ├── hex_int.hpp │ ├── hypergraph.hpp │ ├── identity.hpp │ ├── ifdbg.hpp │ ├── ilinenostream.hpp │ ├── indent_level.hpp │ ├── indexed.hpp │ ├── indexgraph.hpp │ ├── indices_after.hpp │ ├── indirect.hpp │ ├── info_debug.hpp │ ├── inline.hpp │ ├── input_error.hpp │ ├── insert_to.hpp │ ├── int_hash_map.hpp │ ├── int_types.hpp │ ├── interruption_point.hpp │ ├── intorpointer.hpp │ ├── intrusive_refcount.hpp │ ├── io.hpp │ ├── is_container.hpp │ ├── 
is_null.hpp │ ├── itoa.hpp │ ├── karma_tostr.hpp │ ├── kbest-test.cc │ ├── kbest.cc │ ├── kbest.h │ ├── key_to_blob.hpp │ ├── large_streambuf.hpp │ ├── lazier_forest.hpp │ ├── lazy_forest_kbest.hpp │ ├── lazy_forest_kbest_test.hpp │ ├── lc_ascii.hpp │ ├── leaf_configurable.hpp │ ├── leb128.hpp │ ├── lerp.hpp │ ├── likely.hpp │ ├── list.h │ ├── lock_policy.hpp │ ├── log_intsize.hpp │ ├── lz4.c │ ├── lz4.h │ ├── lz4.hpp │ ├── lz4stream.hpp │ ├── main.hpp │ ├── main.msvc.hpp │ ├── map_from_set.hpp │ ├── math_constants.hpp │ ├── maybe_update_bound.hpp │ ├── mdb_from_db.1 │ ├── mdb_from_db.c │ ├── mean_field_normalize.hpp │ ├── mean_field_scale.hpp │ ├── memleak.hpp │ ├── memmap.hpp │ ├── memoindex.hpp │ ├── memory_archive.hpp │ ├── memory_stats.hpp │ ├── memory_stream.hpp │ ├── monotonic_time.hpp │ ├── must_eof.hpp │ ├── myassert.h │ ├── named_enum.hpp │ ├── named_main.hpp │ ├── nan.hpp │ ├── nary_tree.hpp │ ├── new_shared.hpp │ ├── nibble_array.hpp │ ├── no_locking.hpp │ ├── nondet_random.cpp │ ├── noreturn.hpp │ ├── normalize.hpp │ ├── normalize_range.hpp │ ├── null_deleter.hpp │ ├── null_ostream.hpp │ ├── null_output_iterator.hpp │ ├── null_terminated.hpp │ ├── optional_pair.hpp │ ├── order_preserving.hpp │ ├── os.hpp │ ├── os_memory.hpp │ ├── outedges.hpp │ ├── packedalloc.hpp │ ├── pairlist.hpp │ ├── parse_float.hpp │ ├── path_traits.hpp │ ├── percent.hpp │ ├── periodic.hpp │ ├── pod.hpp │ ├── podcpy.hpp │ ├── pointer_int.hpp │ ├── pointer_traits.hpp │ ├── pointeroffset.hpp │ ├── pool_construct.ipp │ ├── pool_traits.hpp │ ├── power_of_10.hpp │ ├── predicate_compose.hpp │ ├── prefix_option.hpp │ ├── print_read.hpp │ ├── print_width.hpp │ ├── printlines.hpp │ ├── priority_queue.hpp │ ├── proc_linux.hpp │ ├── program_options.hpp │ ├── program_options_config_example.txt │ ├── program_options_path.hpp │ ├── property.hpp │ ├── property_factory.hpp │ ├── push_backer.hpp │ ├── quote.hpp │ ├── random.hpp │ ├── random.ipp │ ├── randomreader.hpp │ ├── read_stream.hpp │ ├── 
reconstruct.hpp │ ├── reduce.hpp │ ├── replace_digits.hpp │ ├── reserved_memory.hpp │ ├── safe_bool.hpp │ ├── safe_db.hpp │ ├── sample │ ├── sample.graph │ ├── sample.lattice │ ├── sample.lattice.carmel │ └── simple.cycle.graph │ ├── segments.hpp │ ├── semiring.hpp │ ├── serialize_batch.hpp │ ├── serialize_config.hpp │ ├── set_difference.hpp │ ├── shared_ptr.hpp │ ├── shell.hpp │ ├── shell_escape.hpp │ ├── show.hpp │ ├── simple_serialize.hpp │ ├── siphash.hpp │ ├── size_mega.hpp │ ├── slist.h │ ├── small_vector.hpp │ ├── snprintf.hpp │ ├── sparse_vector.hpp │ ├── split.hpp │ ├── split_noquote.hpp │ ├── stable_vector.hpp │ ├── stackalloc.hpp │ ├── stacktrace.hpp │ ├── static_fgets_buf.h │ ├── static_itoa.h │ ├── statistics.hpp │ ├── stopwatch.hpp │ ├── stream_util.hpp │ ├── stream_whitespace.hpp │ ├── strhash.cc │ ├── strhash.h │ ├── stride.hpp │ ├── string.hpp │ ├── string_buffer.hpp │ ├── string_builder.hpp │ ├── string_match.hpp │ ├── string_to.hpp │ ├── string_tr.hpp │ ├── stringable.hpp │ ├── stringkey.cc │ ├── stringkey.h │ ├── strstrsep.c │ ├── strstrsep.h │ ├── swap_pod.hpp │ ├── swapbatch.hpp │ ├── symbol.hpp │ ├── tails_up_hypergraph.hpp │ ├── teestream.hpp │ ├── test.hpp │ ├── test │ ├── LazyKbestTrees_test.cpp │ ├── Makefile │ ├── backtrace.cpp │ ├── epsilon.cpp │ ├── make.sh │ ├── make_kbest.sh │ ├── slist.cpp │ ├── tree.cpp │ └── weight_underflow.cpp │ ├── text-to-cc.cpp │ ├── the_null_ostream.hpp │ ├── thread_group.hpp │ ├── threadlocal.hpp │ ├── time_report.hpp │ ├── time_series.hpp │ ├── time_space_report.hpp │ ├── to_from_buf.hpp │ ├── tree.hpp │ ├── treetrie.hpp │ ├── triangular_array.hpp │ ├── type_string.hpp │ ├── type_traits.hpp │ ├── umod.hpp │ ├── unimplemented.hpp │ ├── unlimit_memlock.hpp │ ├── unordered.hpp │ ├── unthreaded_ptr.hpp │ ├── validate.hpp │ ├── value_str.hpp │ ├── verbose_exception.hpp │ ├── warn.hpp │ ├── warning_compiler.h │ ├── warning_pop.h │ ├── warning_push.h │ ├── weight.cc │ ├── weight.h │ ├── word_spacer.hpp │ ├── 
words.h │ └── zip_builder.hpp ├── sblm ├── 10.counted ├── 10.eng-parse ├── 10.for-norm ├── Makefile ├── README ├── TODO ├── add-pcfg-feature ├── cat-pcfg-for-divide ├── count.py ├── dumpx.py ├── etree-stats.py ├── etree.py ├── example.py ├── fast-lhs-sums-map ├── graehl.py ├── had-pcfg-probs ├── lhs-sums-map ├── nbest-sblm.py ├── nbest.py ├── ngram.py ├── optfunc.py ├── pcfg-backoff ├── pcfg-map ├── pcfg-map-precomb ├── pcfg.py ├── precombine.py ├── rules ├── sample │ ├── dev.e-parse │ ├── test.e-parse │ └── training.e-parse ├── sbmtrule.py ├── test.sh ├── test.txt ├── tree.py └── xrs-pcfg-events.cpp ├── utf8.h ├── utf8 ├── checked.h ├── core.h └── unchecked.h └── util ├── .aspell.en.pws ├── .emacs ├── .gdbinit ├── .gitconfig ├── .gitignore ├── .octaverc ├── .svn.authorsfile ├── 1count.cc ├── C-small.cc ├── add_paths.sh ├── addlicense.sh ├── aliases.sh ├── alignment-links.py ├── bash.txt ├── bashlib.sh ├── bl3.sh ├── bloblib.sh ├── c++space ├── camelcase.pl ├── ccache-wrapper.sh ├── charvocab.py ├── check-condor ├── cj ├── close-ns-inplace.pl ├── codejam-example.cc ├── codejam.hh ├── color.xetex ├── config.fish ├── datespan.py ├── dependencies.sh ├── dictdiff ├── diffnbest.pl ├── dotprod.py ├── dropcaches.c ├── dumpx.py ├── edit ├── emacs.reg ├── etree.py ├── extract-field-fast.pl ├── extract-field.pl ├── featstats.py ├── findscripts.sh ├── fix-include-guard-inplace.pl ├── fixunrpn_ ├── float-round.pl ├── forall.sub ├── format-doxygen-c-comment ├── gcc.sh ├── giraffe ├── giraffe.0.3 ├── giraffe.split ├── gist ├── git-completion.bash ├── gitalias.sh ├── gitcredit ├── gnuplot.auto.inc ├── graehl.py ├── growth ├── hexnorm.pl ├── hippie-expand.emacs.txt ├── identity.py ├── indent-c-comment ├── inpy ├── insert_attributes_opl.pl ├── iomr-hadoop ├── joinleft ├── lc1count.cc ├── libgraehl.pl ├── license.txt ├── localgcc.sh ├── localhistory.sh ├── log_fn.py ├── mflist.pl ├── misc.sh ├── monitor.py ├── nbest.py ├── newwin.sh ├── nfeats ├── no-trailing-space-inplace.pl ├── 
optfunc.py ├── osx-setup.sh ├── pandoc.constantia.css ├── pandoc.css ├── parharmonize.cc ├── pcfg.py ├── pragma_once.py ├── printers.py ├── pychecks.sh ├── qsh ├── random-c-array.py ├── randomwords.py ├── ref-updated ├── reject_chars.py ├── relpath ├── relpathp ├── remove_namespace.py ├── sample └── alignment-links.tsv ├── shortenpar.pl ├── space-brace-inplace.pl ├── split.lua ├── splitutf8.pl ├── start-hadoop ├── stats.py ├── subst.pl ├── subst.pypy.sh ├── summarize_num.pl ├── svndiff.sh ├── template.py ├── test.grf ├── textToC.py ├── time.sh ├── udump ├── unionfind.hh ├── unrpn_ ├── valgrind.supp ├── viz-tree-string-pair.pl ├── why-empty.pl ├── windows-vista-fonts.sh ├── xetex.template ├── xetex.template2 └── yuminstall.txt /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto 2 | .sln text eol=crlf 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | bin 2 | .DS_Store 3 | deps 4 | obj 5 | *~ 6 | *.pyc 7 | lc1count 8 | .history 9 | trash.* 10 | carmel/test/logs 11 | carmel/test/span.spell.trained2 12 | openfst 13 | util/reveal.js 14 | latest.log 15 | balance/balance 16 | *.lex.cc 17 | *.lex.hh 18 | util/aliases.sh 19 | util/.bashrc 20 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(USE_BOOST_ROOT "/local/graehl/c/xmt-externals/FC12/libraries/boost-1.69.0" CACHE STRING 2 | "If this exists, it's used instead of system path when finding boost libs") 3 | option(USE_STATIC "prefer static linking except for libc" ON) 4 | add_definitions(-fvisibility=hidden) 5 | cmake_minimum_required(VERSION 3.15) 6 | project(all) 7 | if (USE_STATIC) 8 | set(ZLIB_USE_STATIC_LIBS ON) 9 | set(ZSTD_USE_STATIC_LIBS ON) 10 | set(Boost_USE_STATIC_LIBS ON) 11 | 
endif() 12 | set(Boost_require_VERSION 1.53) # earlier is probably fine 13 | set(CMAKE_CXX_STANDARD 17) 14 | include_directories("${PROJECT_SOURCE_DIR}") 15 | list(APPEND subdirs 16 | carmel 17 | forest-em 18 | ) 19 | 20 | macro(our_boost_libs) 21 | if (EXISTS "${USE_BOOST_ROOT}") 22 | set(Boost_NO_SYSTEM_PATHS 1) 23 | set(BOOST_ROOT "${USE_BOOST_ROOT}") 24 | set(BOOST_INCLUDEDIR "${USE_BOOST_ROOT}/include") 25 | endif() 26 | find_package(Boost "${Boost_require_VERSION}" COMPONENTS ${ARGV} REQUIRED) 27 | include_directories(${BOOST_INCLUDE_DIR}) 28 | foreach(BLIB ${ARGV}) 29 | list(APPEND OUR_BOOST_LIBS "Boost::${BLIB}") 30 | endforeach() 31 | message("boost libs: ${OUR_BOOST_LIBS}") 32 | endmacro() 33 | 34 | foreach(subdir ${subdirs}) 35 | add_subdirectory(${subdir}) 36 | endforeach() 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | (see carmel/LICENSE for terms covering the carmel/ subproject) 2 | 3 | Copyright 2011 Jonathan Graehl - http://graehl.org/ 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 
16 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | cd carmel && make -j 4 3 | -------------------------------------------------------------------------------- /balance/Makefile: -------------------------------------------------------------------------------- 1 | STD = -std=c++11 2 | STDLIB = -stdlib=libc++ 3 | CXXFLAGS = $(STD) $(STDLIB) 4 | balance: balance.lex.cc balance.cc 5 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) -o $@ $^ 6 | %.lex.cc %.lex.hh: %.lext 7 | flex --header-file=$*.lex.hh --outfile=$*.lex.cc $< 8 | -------------------------------------------------------------------------------- /balance/balance.cc: -------------------------------------------------------------------------------- 1 | /** \file 2 | 3 | perhaps a more practical way to solve unbalanced brace (C++ namespace) confusion: 4 | 5 | use mcpp (https://github.com/h8liu/mcpp) to expand your .cpp 6 | 7 | then add { } at the beginning and end of the file. then emacs syntax-table 8 | sexpr navigation from eof back will show you the dangling open (excess closes tend to 9 | error out in compile more intuitively). 
10 | 11 | C11 lexer from http://www.quut.com/c/ANSI-C-grammar-l-2011.html 12 | */ 13 | 14 | #include "balance.lex.hh" 15 | #include <fstream> 16 | #include <iostream> 17 | #include <string> 18 | 19 | using namespace std; 20 | 21 | void err(char const* msg) { 22 | cerr << msg << '\n'; 23 | abort(); 24 | } 25 | 26 | void run(istream& in, char const* name) { 27 | cerr << name << " ...\n"; 28 | yyFlexLexer l(&in, &cerr); 29 | while (in && l.yylex()) { 30 | cout << string(l.YYText(), l.YYLeng()) << "\n"; 31 | } 32 | cout << '\n'; 33 | } 34 | 35 | void run(char const* name) { 36 | ifstream in(name); 37 | if (!in) err(name); 38 | run(in, name); 39 | } 40 | 41 | int main(int argc, char* argv[]) { 42 | if (argc < 2) 43 | run(cin, "[STDIN]"); 44 | else 45 | for (int i = 1; i < argc; ++i) run(argv[i]); 46 | return 0; 47 | } 48 | -------------------------------------------------------------------------------- /balance/build.sh: -------------------------------------------------------------------------------- 1 | LD=gcc CPP=gcc CXX=g++ CC=gcc CFLAGS= make 2 | -------------------------------------------------------------------------------- /carmel/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(carmel) 2 | our_boost_libs(random timer) 3 | add_executable(carmel src/carmel.cc src/fst.cc src/train.cc src/gibbs.cc) 4 | target_link_libraries(carmel ${OUR_BOOST_LIBS}) 5 | -------------------------------------------------------------------------------- /carmel/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/carmel/LICENSE -------------------------------------------------------------------------------- /carmel/NOTES: -------------------------------------------------------------------------------- 1 | 2 | bayestag / scala version regression for -blocked 3 | 4 | 
-------------------------------------------------------------------------------- /carmel/ToDo: -------------------------------------------------------------------------------- 1 | features: 2 | training with conditional normalization - option to leave alone or assign uniform weights to arcs with zero counts for some input leaving some state (but often people like zeroing out unused arcs for pruning?) 3 | attach and preserve arbitrary labels to arcs (states already have their arbitrary names) 4 | 'wildcard', 'except-state' and 'except-global' label for arcs - for input or output not seen leaving the state, or at all 5 | external input/output dictionary files? binary format? 6 | option to iteratively sum paths with e-cycles for -S and -t. use matrix math (e-paths of length 0,1,2,3... = 1+A^1+A^2+ ...) 7 | 8 | code: 9 | List::count_length for input sequences - use iterators instead? 10 | unnecessary copying of path lists due to lameness of output iterator / insert_iterator 11 | STL allocator - get rid of CUSTOMNEW mess (is it faster?) 
12 | command-line regression tests 13 | unit tests 14 | STL hash 15 | one filename per public class: Class.h Class.cc 16 | 17 | bugs: 18 | -------------------------------------------------------------------------------- /carmel/carmel: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ -z "$ARCH" ] ; then 3 | u=`uname` 4 | ARCH=cygwin 5 | if [ $u = Linux ] ; then 6 | ARCH=linux 7 | fi 8 | if [ $u = SunOS ] ; then 9 | ARCH=solaris 10 | fi 11 | if [ $u = Darwin ] ; then 12 | ARCH=macosx 13 | fi 14 | fi 15 | realprog=$0 16 | d=`dirname $realprog` 17 | if [ -L $realprog ] ; then 18 | if [ -x "`which readlink`" ] ; then 19 | realprog=`readlink $0` 20 | if [ ${realprog:0:1} = / ] ; then #absolute path 21 | d=`dirname $realprog` 22 | else 23 | d=$d/`dirname $realprog` 24 | fi 25 | fi 26 | fi 27 | 28 | if [ $ARCH = Darwin ]; then 29 | exec $d/$ARCH/carmel $* 30 | else 31 | exec $d/$ARCH/carmel.static $* 32 | fi 33 | 34 | -------------------------------------------------------------------------------- /carmel/carmel-tutorial/carmel-training.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/carmel/carmel-tutorial/carmel-training.pdf -------------------------------------------------------------------------------- /carmel/carmel-tutorial/cat.fsa: -------------------------------------------------------------------------------- 1 | 0 2 | (0 (0 *e* "space")) 3 | (0 (c1 *e* "c1")) 4 | (0 (c2 *e* "c2")) 5 | (0 (c3 *e* "c3")) 6 | (c1 (0 *e* "space")) 7 | (c1 (c1 *e* "c1")) 8 | (c1 (c2 *e* "c2")) 9 | (c1 (c3 *e* "c3")) 10 | (c2 (0 *e* "space")) 11 | (c2 (c1 *e* "c1")) 12 | (c2 (c2 *e* "c2")) 13 | (c2 (c3 *e* "c3")) 14 | (c3 (0 *e* "space")) 15 | (c3 (c1 *e* "c1")) 16 | (c3 (c2 *e* "c2")) 17 | (c3 (c3 *e* "c3")) 18 | -------------------------------------------------------------------------------- 
/carmel/carmel-tutorial/cat.fsa.trained: -------------------------------------------------------------------------------- 1 | 0 2 | (0 (0 *e* "space" 0.0896584819643305)) 3 | (0 (1 *e* "c1" 0.636327421275486)) 4 | (0 (2 *e* "c2" 0.274013807990881)) 5 | (0 (3 *e* "c3" 2.88769303122129e-07)) 6 | (1 (0 *e* "space" 4.06374797674586e-05)) 7 | (1 (1 *e* "c1" 0.124246995161629)) 8 | (1 (2 *e* "c2" 0.874856889754987)) 9 | (1 (3 *e* "c3" 0.00085547760361627)) 10 | (2 (0 *e* "space" 0.153523266340634)) 11 | (2 (1 *e* "c1" 0.116884883887446)) 12 | (2 (2 *e* "c2" 0.125343236298317)) 13 | (2 (3 *e* "c3" 0.604248613473604)) 14 | (3 (0 *e* "space" 0.535737245619616)) 15 | (3 (1 *e* "c1" 0.0455422297669097)) 16 | (3 (2 *e* "c2" 0.0491608536591124)) 17 | (3 (3 *e* "c3" 0.369559670954362)) 18 | -------------------------------------------------------------------------------- /carmel/carmel-tutorial/cat.fsa.trained.noe: -------------------------------------------------------------------------------- 1 | 0 2 | (0 (0 "space" "space" 0.0896584819643305)) 3 | (0 (1 "c1" "c1" 0.636327421275486)) 4 | (0 (2 "c2" "c2" 0.274013807990881)) 5 | (0 (3 "c3" "c3" 2.88769303122129e-07)) 6 | (1 (0 "space" "space" 4.06374797674586e-05)) 7 | (1 (1 "c1" "c1" 0.124246995161629)) 8 | (1 (2 "c2" "c2" 0.874856889754987)) 9 | (1 (3 "c3" "c3" 0.00085547760361627)) 10 | (2 (0 "space" "space" 0.153523266340634)) 11 | (2 (1 "c1" "c1" 0.116884883887446)) 12 | (2 (2 "c2" "c2" 0.125343236298317)) 13 | (2 (3 "c3" "c3" 0.604248613473604)) 14 | (3 (0 "space" "space" 0.535737245619616)) 15 | (3 (1 "c1" "c1" 0.0455422297669097)) 16 | (3 (2 "c2" "c2" 0.0491608536591124)) 17 | (3 (3 "c3" "c3" 0.369559670954362)) 18 | -------------------------------------------------------------------------------- /carmel/carmel-tutorial/delete.fst: -------------------------------------------------------------------------------- 1 | 0 2 | (0 (0 "a" "a")) 3 | (0 (0 "a" *e*)) 4 | (0 (0 "b" "b")) 5 | (0 (0 "b" *e*)) 6 | (0 (0 "c" "c")) 7 | (0 
(0 "c" *e*)) 8 | -------------------------------------------------------------------------------- /carmel/carmel-tutorial/delete.fst.trained: -------------------------------------------------------------------------------- 1 | 0 2 | (0 (0 "a") (0 "a" *e* 1.36958138882441e-18) (0 "b" 2.88391710752266e-18) (0 "b" *e*) (0 "c") (0 "c" *e* 7.23771649369207e-20)) 3 | -------------------------------------------------------------------------------- /carmel/carmel-tutorial/deltrans.data: -------------------------------------------------------------------------------- 1 | "a" "b" "c" 2 | "b" "a" 3 | "a" "b" "c" 4 | "a" "c" 5 | "a" "c" 6 | "b" "c" 7 | "a" "c" 8 | "b" "c" 9 | -------------------------------------------------------------------------------- /carmel/carmel-tutorial/deltrans.data.compact: -------------------------------------------------------------------------------- 1 | 1 2 | "a" "b" "c" 3 | "b" "a" 4 | 1 5 | "a" "b" "c" 6 | "a" "c" 7 | 2 8 | "a" "c" 9 | "b" "c" 10 | -------------------------------------------------------------------------------- /carmel/carmel-tutorial/epron-jpron.data: -------------------------------------------------------------------------------- 1 | "L" "IY" "N" 2 | "R" "I" "N" 3 | "R" "AE" "N" 4 | "R" "A" "N" 5 | "F" "AE" "N" 6 | "H" "A" "N" 7 | "L" "AY" "N" 8 | "R" "A" "I" "N" 9 | -------------------------------------------------------------------------------- /carmel/carmel-tutorial/trans.fst: -------------------------------------------------------------------------------- 1 | 0 2 | (0 (0 "a" "a")) 3 | (0 (0 "a" "b")) 4 | (0 (0 "a" "c")) 5 | (0 (0 "b" "a")) 6 | (0 (0 "b" "b")) 7 | (0 (0 "b" "c")) 8 | (0 (0 "c" "a")) 9 | (0 (0 "c" "b")) 10 | (0 (0 "c" "c")) 11 | -------------------------------------------------------------------------------- /carmel/carmel-tutorial/trans.fst.trained: -------------------------------------------------------------------------------- 1 | 0 2 | (0 (0 "a" 0.25) (0 "a" "b" 0.75) (0 "b" "a" 0.999726408340245) (0 
"b" 0.000273591658524948) (0 "b" "c" 1.22743061972586e-12) (0 "c" "a" 0.25) (0 "c" 0.75)) 3 | -------------------------------------------------------------------------------- /carmel/carmel-tutorial2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/carmel/carmel-tutorial2.pdf -------------------------------------------------------------------------------- /carmel/debug.sh: -------------------------------------------------------------------------------- 1 | gdb --args bin/$ARCH/carmel.debug -k 2 test/kbest.small.cycle 2 | -------------------------------------------------------------------------------- /carmel/doc/carmel-tutorial_files/filelist.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /carmel/doc/carmel-tutorial_files/image001.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/carmel/doc/carmel-tutorial_files/image001.gif -------------------------------------------------------------------------------- /carmel/doc/carmel-tutorial_files/image002.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/carmel/doc/carmel-tutorial_files/image002.gif -------------------------------------------------------------------------------- /carmel/doc/carmel-tutorial_files/image003.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/carmel/doc/carmel-tutorial_files/image003.gif 
-------------------------------------------------------------------------------- /carmel/doc/carmel-tutorial_files/image004.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/carmel/doc/carmel-tutorial_files/image004.gif -------------------------------------------------------------------------------- /carmel/doc/carmel-tutorial_files/image005.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/carmel/doc/carmel-tutorial_files/image005.gif -------------------------------------------------------------------------------- /carmel/doc/carmel-tutorial_files/image006.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/carmel/doc/carmel-tutorial_files/image006.gif -------------------------------------------------------------------------------- /carmel/doc/carmel-tutorial_files/image007.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/carmel/doc/carmel-tutorial_files/image007.gif -------------------------------------------------------------------------------- /carmel/doc/carmel-tutorial_files/image008.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/carmel/doc/carmel-tutorial_files/image008.gif -------------------------------------------------------------------------------- /carmel/doc/carmel-tutorial_files/image009.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/carmel/doc/carmel-tutorial_files/image009.gif -------------------------------------------------------------------------------- /carmel/doc/carmel-tutorial_files/image010.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/carmel/doc/carmel-tutorial_files/image010.gif -------------------------------------------------------------------------------- /carmel/doc/carmel-tutorial_files/image011.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/carmel/doc/carmel-tutorial_files/image011.gif -------------------------------------------------------------------------------- /carmel/make-dictionary.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use Getopt::Long; 4 | 5 | #outputs an FSA of character-recognizer for lines on STDIN (newline excluded) 6 | 7 | my $end="END"; 8 | my $start=0; 9 | 10 | print "$end\n"; 11 | 12 | my $s=1; 13 | my $random=0; 14 | my $weighted=0; 15 | 16 | GetOptions("random!"=>\$random 17 | ,"weighted!"=>\$weighted 18 | ) || die; 19 | 20 | 21 | sub quote_char { 22 | my ($c)=@_; 23 | $c='\"' if $c eq '"'; 24 | return qq{"$c"}; 25 | } 26 | 27 | my $num_match=qr/(?:[+\-]|\b)[0123456789]+(?:[.][0123456789]*(?:[eE][0123456789\-+]*)?)?/; 28 | 29 | while(<>) { 30 | my $w=1; 31 | if ($weighted) { 32 | s/((?:e\^|10\^)?$num_match(?:ln|log)?)\s+// || die "no weight found for line $_ with --weighted"; 33 | $w=$1; 34 | } 35 | $w=1-rand(1) if $random; 36 | my $p=$start; 37 | chomp; 38 | my @c=split //,$_; 39 | for (0..$#c) { 40 | my $d=($_==$#c)?$end:$s++; 41 | print "($p $d ",&quote_char($c[$_]); 42 | print " $w" if $w ne '1' && $_==0; 43 | print ")\n"; 44 | $p=$d; 45 | } 46 | 
} 47 | 48 | -------------------------------------------------------------------------------- /carmel/sample/chain.1: -------------------------------------------------------------------------------- 1 | 2 2 | (0 (2 0.1) (1 *e* b 0.6) (0 *e* a 0.3) (4 *e* a .1)) 3 | (1 (2 *e* a 0.3) (0 *e* b 0.7)) 4 | (2) 5 | (4 (2 *e* c)) 6 | -------------------------------------------------------------------------------- /carmel/sample/chain.2: -------------------------------------------------------------------------------- 1 | 1 2 | (0 (1 *e* 1) (0 a c .6) (0 a d .4) (0 b d .2) (0 b e .8)) 3 | (1) 4 | -------------------------------------------------------------------------------- /carmel/sample/chain.corpus: -------------------------------------------------------------------------------- 1 | 2 | d e c 3 | 4 | d d 5 | 6 | 7 | -------------------------------------------------------------------------------- /carmel/sample/chain.mid: -------------------------------------------------------------------------------- 1 | 0 2 | (0 (0 a b .1) (0 b a .2) (0 a a .9) (0 b b .8)) 3 | -------------------------------------------------------------------------------- /carmel/sample/comments: -------------------------------------------------------------------------------- 1 | S 2 | #aasdf 3 | #A 4 | (S (S "PRO 5 | #" "they" -20log)) 6 | (S (S "AUX" "can" -20ln)) 7 | (S (S "VERB" "can" 200log)) 8 | (S (S "NOUN" "fish" 0.0001)) 9 | (S (S "VERB" "fish" 0.0001)) 10 | -------------------------------------------------------------------------------- /carmel/sample/decipher/README: -------------------------------------------------------------------------------- 1 | unsupervised decipherment of a 1-1 letter subst. 
cipher (carmel knows nothing about 1-1, however) 2 | -------------------------------------------------------------------------------- /carmel/sample/decipher/correct: -------------------------------------------------------------------------------- 1 | 2 | _ D E C I P H E R M E N T _ I S _ 3 | 4 | _ T H E _ A N A L Y S I S _ O F _ D O C U M E N T S _ W R I T T E N _ I N _ 5 | 6 | _ A N C I E N T _ L A N G U A G E S _ W H E R E _ T H E _ L A N G U A G E _ I S _ U N K N O W N _ O R _ 7 | 8 | _ K N O W L E D G E _ O F _ T H E _ L A N G U A G E _ H A S _ B E E N _ L O S T _ I T _ I S _ C L O S E L Y _ 9 | 10 | _ R E L A T E D _ T O _ C R Y P T A N A L Y S I S _ T H E _ D I F F E R E N C E _ B E I N G _ T H A T _ T H E _ 11 | 12 | _ O R I G I N A L _ D O C U M E N T _ W A S _ N O T _ D E L I B E R A T E L Y _ W R I T T E N _ T O _ B E _ 13 | 14 | _ D I F F I C U L T _ T O _ D E C I P H E R _ T H E _ T E R M _ H A S _ A L S O _ B E E N _ U S E D _ T O _ 15 | 16 | _ D E S C R I B E _ T H E _ A N A L Y S I S _ O F _ T H E _ G E N E T I C _ C O D E _ S E E _ T H E _ H U M A N _ 17 | 18 | _ G E N O M E _ P R O J E C T _ F O R _ M O R E _ O N _ T H I S _ S O M E _ P E O P L E _ H A V E _ A L S O _ U S E D _ 19 | 20 | _ T H E _ W O R D _ M E T A P H O R I C A L L Y _ T O _ M E A N _ S O M E T H I N G _ L I K E _ U N D E R S T A N D I N G _ 21 | -------------------------------------------------------------------------------- /carmel/sample/decipher/errors.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | function eval-res { 4 | cat $1 | tr ' ' '\012' | awk 'NF > 0' | tr -d '"' >z1 5 | cat $2 | tr ' ' '\012' | awk 'NF > 0' | tr -d '"' >z2 6 | echo `paste -d ' ' z1 z2 | awk '$1 != $2' | wc -l` 7 | #echo `diff -y z1 z2 | egrep '(\||<|>)' | wc -l` '('`diff z1 z2 | grep '^<' | wc -l`')' 8 | #grep '^<' | wc -l 9 | } 10 | echo ${suf:=.trained} ${csuf:=2} ${carmel:=carmel} ${chanbase=subst.wfst} 11 | echo 
${src:=plain.bi.wfsa} ${chan:=$chanbase$suf} ${cipher:=cipher$csuf} ${correct:=correct$csuf} ${log:=errors.log} 12 | 13 | if [ "$weights" ] ; then 14 | suf=`basename $weights` 15 | set -x 16 | $carmel -H --load-fem-param=$weights $src $chanbase --no-compose --write-loaded=$suf 17 | chan=$chanbase.$suf 18 | set +x 19 | fi 20 | 21 | $carmel -HJ -= 3.0 $chan > $chan.cubed 22 | $carmel --project-right --project-identity-fsa $src > $src.id 23 | function errors_chan 24 | { 25 | $carmel -qbsriQIWEk 1 $src.id $1 < $cipher > $chan.decode 26 | echo "errors $2 = " `eval-res $correct $chan.decode ` 27 | } 28 | ( 29 | echo 'length of text = ' `tr -d '"_' < $correct | wc -w` 30 | errors_chan $chan " "; 31 | errors_chan $chan.cubed "cubed" 32 | ) 2>&1 | tee $log 33 | -------------------------------------------------------------------------------- /carmel/sample/decipher/fem.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | echo ${suf:=fem} ${ITER:=500} ${restarts:=0} 3 | i4=$((ITER/4)) 4 | [ "$EM" ] || CRP=1 5 | if [ "$EM" ] ; then 6 | $fem -f forest -H -n norm -I param -e 0 -o $suf.em -i $i4 -r $restarts 7 | weights=$suf.em ./errors.sh 8 | fi 9 | if [ "$CRP" ] ; then 10 | if [ "$DA" ] ; then 11 | sda=".crp.da=.$DA" 12 | argda="--high-temp=2 --low-temp=$DA" 13 | fi 14 | $fem -f forest -H -n norm -I param -e 0 -o $suf$sda $argda --crp=$ITER --burnin=$i4 --alpha=alpha 15 | weights=$suf$sda ./errors.sh 16 | fi 17 | -------------------------------------------------------------------------------- /carmel/sample/decipher/to-fem.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | $carmel --train-cascade -aHJmM -1 cipher2 plain.bi.wfsa subst.wfst --priors=1e5,1e-2 --fem-norm=norm --fem-forest=forest --fem-param=param --normby=NC --fem-alpha=alpha 3 | -------------------------------------------------------------------------------- /carmel/sample/do.graphviz.srilm.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | B=${B:-../bin/cage/carmel.debug} 3 | ../src/sri2fsa.pl < tiny.sri > tiny.fsa;$B tiny.fsa -YZB > tiny.dot;dot -O -Tpdf tiny.dot;dot -O -Gdpi=150 -Tpng tiny.dot 4 | -------------------------------------------------------------------------------- /carmel/sample/egraph: -------------------------------------------------------------------------------- 1 | 4 2 | (1 (2 *e* 2) (3 *e* .5) (4 *e* 8)) 3 | (2 (3 *e* 1) (4 *e* 3)) 4 | (3 (4 *e* 1)) 5 | -------------------------------------------------------------------------------- /carmel/sample/emptyfsa: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/carmel/sample/emptyfsa -------------------------------------------------------------------------------- /carmel/sample/fsa1: -------------------------------------------------------------------------------- 1 | 3 2 | (0 (1 "he")) 3 | (1 (2 "saw")) 4 | (2 (3 "me")) 5 | (1 (4 "ran")) 6 | (4 (3 "home")) 7 | (0 (5 "she")) 8 | (5 (3 "talked")) 9 | -------------------------------------------------------------------------------- /carmel/sample/fsa10: -------------------------------------------------------------------------------- 1 | 1 2 | (0 1) 3 | -------------------------------------------------------------------------------- /carmel/sample/fsa11: -------------------------------------------------------------------------------- 1 | 1 2 | (0 (0 a) (0 .5) (0) (1) (1 b)) 3 | -------------------------------------------------------------------------------- /carmel/sample/fsa2: -------------------------------------------------------------------------------- 1 | F 2 | (S (A "he")) 3 | (A (B "ran")) 4 | (B (F "home")) 5 | -------------------------------------------------------------------------------- /carmel/sample/fsa3: 
-------------------------------------------------------------------------------- 1 | 3 2 | (0 (1 "he")) 3 | (1 (2 "saw")) 4 | (2 (3 "me")) 5 | (0 (5 "she")) 6 | (5 (2 "studied")) 7 | -------------------------------------------------------------------------------- /carmel/sample/fsa4: -------------------------------------------------------------------------------- 1 | F 2 | (S (A "the")) 3 | (A (A "big")) 4 | (A (F "dog")) 5 | -------------------------------------------------------------------------------- /carmel/sample/fsa5: -------------------------------------------------------------------------------- 1 | F 2 | (S (A "the")) 3 | (S (A *e*)) 4 | (A (B "big")) 5 | (B (C "big")) 6 | (C (F "dog")) 7 | -------------------------------------------------------------------------------- /carmel/sample/fsa6: -------------------------------------------------------------------------------- 1 | F 2 | (S (A "big")) 3 | (A (B "big")) 4 | (B (C "dog")) 5 | (C (F "big")) 6 | -------------------------------------------------------------------------------- /carmel/sample/fsa7: -------------------------------------------------------------------------------- 1 | 3 2 | (0 (1 "they")) 3 | (1 (2 "can")) 4 | (2 (3 "fish")) 5 | -------------------------------------------------------------------------------- /carmel/sample/fsa8: -------------------------------------------------------------------------------- 1 | 3 2 | (0 1 "they") 3 | (0 2 "please") 4 | (1 2 "can") 5 | (2 3 "fish") 6 | -------------------------------------------------------------------------------- /carmel/sample/fsa9: -------------------------------------------------------------------------------- 1 | 1 2 | (0 (1)) 3 | -------------------------------------------------------------------------------- /carmel/sample/fst1: -------------------------------------------------------------------------------- 1 | 0 2 | (0 (0 "big" "small")) 3 | (0 (0 "dog" "dog")) 4 | 
-------------------------------------------------------------------------------- /carmel/sample/fst2: -------------------------------------------------------------------------------- 1 | 0 2 | (0 (0 "K" "c")) 3 | (0 (0 "AE" "a")) 4 | (0 (1 "SH" "s")) 5 | (1 (0 *e* "h")) 6 | -------------------------------------------------------------------------------- /carmel/sample/kevin_g: -------------------------------------------------------------------------------- 1 | 2 2 | (0 (1 "C" -150ln)) 3 | (0 (1 "V" -50log)) 4 | (1 (0 *e* 0.9)) 5 | (1 (2 *e* 0.1)) 6 | (2) 7 | -------------------------------------------------------------------------------- /carmel/sample/nested: -------------------------------------------------------------------------------- 1 | S 2 | #aasdf 3 | #A 4 | (S (S "PRO 5 | #" "they" -20log)) 6 | (S (S ("AUX" "can" -20ln) ("VERB" "can" 200log))) 7 | (S (S "NOUN" "fish" 0.0001)) 8 | (S (S "VERB" "fish" 0.0001)) 9 | -------------------------------------------------------------------------------- /carmel/sample/tag/README: -------------------------------------------------------------------------------- 1 | source is a fully connected tag bigram 2 | channel is a dictionary allowing a subset of part of speech tags for each word 3 | cipher is words (decipher into hidden tags) 4 | -------------------------------------------------------------------------------- /carmel/sample/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | for f in *fs*; do echo $f 4 | ../bin/$HOST/carmel $f -k 10 >/dev/null; done 5 | -------------------------------------------------------------------------------- /carmel/sample/tiny.sri: -------------------------------------------------------------------------------- 1 | 2 | \data\ 3 | ngram 1=5 4 | ngram 2=6 5 | ngram 3=2 6 | 7 | \1-grams: 8 | -3 a -.1 9 | -2 b -.2 10 | -4 c 11 | -99 -1.5 12 | -1 13 | 14 | \2-grams: 15 | -.5 b 16 | -1 b b -2 17 | -3 a b 18 | -5 a a 19 | -2 b 
a -8 20 | -.3 a -1 21 | 22 | \3-grams: 23 | -.1 a a 24 | -.2 b b a 25 | 26 | \end\ 27 | -------------------------------------------------------------------------------- /carmel/sample/tmp/a: -------------------------------------------------------------------------------- 1 | S 2 | (S (1 "she" 1.0)) 3 | (S (1 "he" 1.0 )) 4 | (1 (2 "can" 0.99)) 5 | (1 (2 "can" 0.01)) 6 | (2 (S "swing" 0.7)) 7 | (2 (S "dance" 0.7)) 8 | (2 (S "swing" 0.3)) 9 | -------------------------------------------------------------------------------- /carmel/sample/tmp/a.2: -------------------------------------------------------------------------------- 1 | F 2 | (S (A "the")) 3 | (A (A "big")) 4 | (A (F "dog")) 5 | -------------------------------------------------------------------------------- /carmel/sample/tmp/a.t: -------------------------------------------------------------------------------- 1 | S 2 | (S (1 "she" "SHE" 1.0)) 3 | (S (1 "he" "HE" 1.0 )) 4 | (1 (2 "can" "C" 0.99)) 5 | (1 (2 "can" "C" 0.01)) 6 | (2 (S "swing" "SSS" 0.7)) 7 | (2 (S "dance" "DDD" 0.7)) 8 | (2 (S "swing" "BBB" 0.3)) 9 | -------------------------------------------------------------------------------- /carmel/sample/tmp/a1: -------------------------------------------------------------------------------- 1 | 3 2 | (0 (1 *e*)) 3 | (0 (1 "he")) 4 | (1 (2 "saw" 0.8)) 5 | (2 (3 "me")) 6 | (1 (4 "ran" 0.2)) 7 | (4 (3 "home" 1.0)) 8 | (0 (5 "she")) 9 | (5 (3 "talked" 1.0)) 10 | -------------------------------------------------------------------------------- /carmel/sample/tmp/a2: -------------------------------------------------------------------------------- 1 | 3 2 | (0 (1 "he" 0.4)) 3 | (1 (2 "see" 0.8)) 4 | (2 (3 "me" 1.0)) 5 | (1 (4 "ran" 0.2)) 6 | (4 (3 "school" 1.0)) 7 | (0 (5 "she" 0.6)) 8 | (5 (3 "talked" 1.0)) 9 | (0 (6 "he" 0.5)) 10 | (6 (0 "dance" 0.9)) 11 | -------------------------------------------------------------------------------- /carmel/sample/tmp/a3: 
-------------------------------------------------------------------------------- 1 | S 2 | (S (1 "he" 1.0)) 3 | (S (1 "she" 1.0)) 4 | (1 (2 "could" 0.99)) 5 | (1 (2 "can" 0.19)) 6 | (1 (2 "could" 0.01)) 7 | (2 (S "dance" 0.7)) 8 | (2 (S "swing" 0.7)) 9 | (2 (S "dance" 0.3)) 10 | -------------------------------------------------------------------------------- /carmel/sample/tmp/a3.t: -------------------------------------------------------------------------------- 1 | S 2 | (S (1 "he" "HE" 1.0)) 3 | (S (1 "she" *e* 1.0)) 4 | (1 (2 "could" "SSS" 0.99)) 5 | (1 (2 "can" "C" 0.19)) 6 | (1 (2 "could" "C" 0.01)) 7 | (2 (S "dance" "C" 0.7)) 8 | (2 (S "swing" "C" 0.7)) 9 | (2 (S "dance" "C" 0.3)) 10 | (1 (3 "ran" "RAN" 0.3)) 11 | (3 (S "home" "HOME" 0.3)) 12 | -------------------------------------------------------------------------------- /carmel/sample/tmp/aa: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/carmel/sample/tmp/aa -------------------------------------------------------------------------------- /carmel/sample/tmp/aaa: -------------------------------------------------------------------------------- 1 | 3 2 | (0 (1 "he")) 3 | (1 (2 "ran")) 4 | (2 (3 "home")) 5 | (3) 6 | -------------------------------------------------------------------------------- /carmel/sample/tmp/b1.wfsa: -------------------------------------------------------------------------------- 1 | 3 2 | (0 (1 "he" 0.4)) 3 | (1 (2 "see" 0.8)) 4 | (2 (3 "me" 1.0)) 5 | (1 (4 "ran" 0.2)) 6 | (4 (3 "school" 1.0)) 7 | (0 (5 "she" 0.6)) 8 | (5 (3 "talked" 1.0)) 9 | (0 (6 "he" 0.5)) 10 | (6 (0 "dance" 0.9)) 11 | -------------------------------------------------------------------------------- /carmel/sample/tmp/fsa1: -------------------------------------------------------------------------------- 1 | 3 2 | (0 (1 "he")) 3 | (1 (2 "saw")) 4 | (2 (3 "me")) 5 | (1 (4 "ran")) 6 | (4 (3 
"home")) 7 | (0 (5 "she")) 8 | (5 (3 "talked")) 9 | -------------------------------------------------------------------------------- /carmel/sample/tmp/fsa2: -------------------------------------------------------------------------------- 1 | F 2 | (S (A "he")) 3 | (A (B "ran")) 4 | (B (F "home")) 5 | -------------------------------------------------------------------------------- /carmel/sample/tmp/fsa3: -------------------------------------------------------------------------------- 1 | 3 2 | (0 (1 "he")) 3 | (1 (2 "saw")) 4 | (2 (3 "me")) 5 | (0 (5 "she")) 6 | (5 (2 "studied")) 7 | -------------------------------------------------------------------------------- /carmel/sample/tmp/fsa4: -------------------------------------------------------------------------------- 1 | F 2 | (S (A "the")) 3 | (A (A "big")) 4 | (A (F "dog")) 5 | -------------------------------------------------------------------------------- /carmel/sample/tmp/fsa4.1: -------------------------------------------------------------------------------- 1 | F 2 | (S (A "the")) 3 | (A (A "big")) 4 | (A (F "dog")) 5 | -------------------------------------------------------------------------------- /carmel/sample/tmp/fsa5: -------------------------------------------------------------------------------- 1 | F 2 | (S (A "the")) 3 | (S (A *e*)) 4 | (A (B "big")) 5 | (B (C "big")) 6 | (C (F "dog")) 7 | -------------------------------------------------------------------------------- /carmel/sample/tmp/fsa6: -------------------------------------------------------------------------------- 1 | F 2 | (S (A "big")) 3 | (A (B "big")) 4 | (B (C "dog")) 5 | (C (F "big")) 6 | -------------------------------------------------------------------------------- /carmel/sample/tmp/fsa7: -------------------------------------------------------------------------------- 1 | 3 2 | (0 (1 "they")) 3 | (1 (2 "can")) 4 | (2 (3 "fish")) 5 | -------------------------------------------------------------------------------- 
/carmel/sample/tmp/fst1: -------------------------------------------------------------------------------- 1 | 0 2 | (0 (0 "big" "small")) 3 | (0 (0 "dog" "dog")) 4 | -------------------------------------------------------------------------------- /carmel/sample/tmp/fst1.1: -------------------------------------------------------------------------------- 1 | 0 2 | (0 (0 "the" "a")) 3 | (0 (0 "big" "small")) 4 | (0 (0 "dog" "dog")) 5 | -------------------------------------------------------------------------------- /carmel/sample/tmp/fst1.2: -------------------------------------------------------------------------------- 1 | 0 2 | (0 (0 "a" "A")) 3 | (0 (0 "small" "SMALL")) 4 | (0 (0 "dog" "DOG")) 5 | -------------------------------------------------------------------------------- /carmel/sample/tmp/fst1.3: -------------------------------------------------------------------------------- 1 | 0 2 | (0 (0 "the" "*")) 3 | (0 (0 "big" "**")) 4 | (0 (0 "dog" "***")) 5 | -------------------------------------------------------------------------------- /carmel/sample/tmp/fst2: -------------------------------------------------------------------------------- 1 | 0 2 | (0 (0 "K" "c")) 3 | (0 (0 "AE" "a")) 4 | (0 (1 "SH" "s")) 5 | (1 (0 *e* "h")) 6 | -------------------------------------------------------------------------------- /carmel/sample/tmp/t: -------------------------------------------------------------------------------- 1 | S 2 | (S (1 "she" "PRO" 1.0)) 3 | (1 (2 "can" "AUX" 0.99)) 4 | (1 (2 "can" "VERB" 0.01)) 5 | (2 (S "swing" "NOUN" 0.7)) 6 | (2 (S "swing" "VERB" 0.3)) 7 | -------------------------------------------------------------------------------- /carmel/sample/tmp/t.1: -------------------------------------------------------------------------------- 1 | 0 2 | (0 (0 "small" "SMALL")) 3 | (0 (0 "dog" "DOG")) 4 | -------------------------------------------------------------------------------- /carmel/sample/tmp/t1: 
-------------------------------------------------------------------------------- 1 | 0 2 | (0 (0 "HE" "he" 0.4)) 3 | (0 (0 "SAW" "saw" 0.8)) 4 | (0 (0 "ME" "me" 1.0)) 5 | 6 | -------------------------------------------------------------------------------- /carmel/sample/tmp/t2: -------------------------------------------------------------------------------- 1 | S 2 | (S (1 "he" "PRO" 1.0)) 3 | (S (1 "she" "PRO" 1.0)) 4 | (1 (2 "could" "AUX" 0.99)) 5 | (1 (2 "can" "AUX" 0.59)) 6 | (1 (2 "could" "VERB" 0.01)) 7 | (2 (S "dance" "NOUN" 0.7)) 8 | (2 (S "swing" "NOUN" 0.7)) 9 | (2 (S "dance" "VERB" 0.3)) 10 | -------------------------------------------------------------------------------- /carmel/sample/tmp/wfsa1: -------------------------------------------------------------------------------- 1 | 3 2 | (0 (1 "he" 0.4)) 3 | (1 (2 "saw" 0.8)) 4 | (2 (3 "me" 1.0)) 5 | (1 (4 "ran" 0.2)) 6 | (4 (3 "home" 1.0)) 7 | (0 (5 "she" 0.6)) 8 | (5 (3 "talked" 1.0)) 9 | -------------------------------------------------------------------------------- /carmel/sample/tmp/wfsa2: -------------------------------------------------------------------------------- 1 | FINAL 2 | (START (PRO "PRO" 0.6)) 3 | (START (NOUN "NOUN" 0.3)) 4 | (START (AUX "AUX" 0.05)) 5 | (START (VERB "VERB" 0.05)) 6 | (PRO (AUX "AUX" 0.4)) 7 | (PRO (VERB "VERB" 0.6)) 8 | (NOUN (NOUN "NOUN" 0.7)) 9 | (NOUN (VERB "VERB" 0.3)) 10 | (AUX (VERB "VERB" 1.0)) 11 | (VERB (NOUN "NOUN" 1.0)) 12 | (VERB (FINAL *e* 1.0)) 13 | (AUX (FINAL *e* 1.0)) 14 | (NOUN (FINAL *e* 1.0)) 15 | (PRO (FINAL *e* 1.0)) 16 | -------------------------------------------------------------------------------- /carmel/sample/tmp/wfst1: -------------------------------------------------------------------------------- 1 | S 2 | (S (S "they" "PRO" 1.0)) 3 | (S (S "she" "PRO" 1.0)) 4 | (S (S "can" "AUX" 0.99)) 5 | (S (S "may" "AUX" 0.99)) 6 | (S (S "can" "VERB" 0.01)) 7 | (S (S "dance" "NOUN" 0.7)) 8 | (S (S "fish" "VERB" 0.3)) 9 | 
-------------------------------------------------------------------------------- /carmel/sample/tmp/wfst2: -------------------------------------------------------------------------------- 1 | S 2 | (S (S "PRO" "they" 0.07)) 3 | (S (S "AUX" "can" 0.21)) 4 | (S (S "VERB" "can" 0.00001)) 5 | (S (S "NOUN" "fish" 0.0001)) 6 | (S (S "VERB" "fish" 0.0001)) 7 | -------------------------------------------------------------------------------- /carmel/sample/tmp/wfst3: -------------------------------------------------------------------------------- 1 | S 2 | (S (S "she" "PRO" 1.0)) 3 | (S (S "could" "AUX" 0.99)) 4 | (S (S "can" "VERB" 0.01)) 5 | (S (S "ran" "NOUN" 0.7)) 6 | (S (S "fish" "VERB" 0.3)) 7 | (S (S "but" "VERB" 0.3)) 8 | (S (S "dance" "VERB" 0.3)) 9 | -------------------------------------------------------------------------------- /carmel/sample/tree-cascade/README: -------------------------------------------------------------------------------- 1 | The graphical "tree cascade" model you described can be learned by carmel. 2 | 3 | If x is a hidden string (POS tags + sentence boundaries for your corpus) with an untrained source model p(x), and there are 1 or more models p_i(z|x) and observed {z_i}, and the parameters of models p and p_0 are to be learned while the other p_i are known, then carmel can learn the best (MAP) model for p(x|{z_i}) = k*p(x)*prod_i{p_i(z_i|x)}, where k is constant since the {z_i} are all known. 4 | 5 | A script and some small models/data are attached. 6 | 7 | From what I heard, you want to incorporate some (vague) expectation as to e.g. what portion of the tags in the whole corpus are NN, etc. Actually using an observation of e.g. 40000 NN in a large corpus will result in a huge p(x|z_NN) model, because the FSA would need at least 40000 states. A more exponential model would be more efficient. You can definitely just explicitly encode a p(x) multiplicative prior - just place it in the cascade and don't normalize it (--normby=...N...) 
or lock the arcs with "-N 0". I also wonder whether an additive prior might be good if you just want to bias the initialization a little (I presume to help with the identification problem with evaluating unsupervised tags/parses) 8 | 9 | To simultaneously train more than one of the conditional models would probably require modifying carmel or exporting to forest-em (the program I mentioned that handles derivation forests and more explicitly encodes the identity and normalization of parameters, rather than relying on carmel's odd "tied parameter group" facility). 10 | -------------------------------------------------------------------------------- /carmel/sample/tree-cascade/hidden.fsa: -------------------------------------------------------------------------------- 1 | FINAL 2 | (0 (Z *e* Z )) 3 | (0 (X *e* X )) 4 | (0 (Y *e* Y )) 5 | (0 (FINAL )) 6 | (Y (Y *e* Y )) 7 | (Y (X *e* X )) 8 | (Y (Z *e* Z )) 9 | (Y (FINAL )) 10 | (X (Y *e* Y )) 11 | (X (X *e* X )) 12 | (X (Z *e* Z )) 13 | (X (FINAL )) 14 | (Z (Y *e* Y )) 15 | (Z (X *e* X )) 16 | (Z (Z *e* Z )) 17 | (Z (FINAL )) 18 | 19 | -------------------------------------------------------------------------------- /carmel/sample/tree-cascade/observed0.data: -------------------------------------------------------------------------------- 1 | a a b c a b c b b a 2 | -------------------------------------------------------------------------------- /carmel/sample/tree-cascade/observed0.fst: -------------------------------------------------------------------------------- 1 | 0 2 | (0 (0 X a )) 3 | (0 (0 Y a )) 4 | (0 (0 Z a )) 5 | (0 (0 X b )) 6 | (0 (0 Y b )) 7 | (0 (0 Z b )) 8 | (0 (0 X c )) 9 | (0 (0 Y c )) 10 | (0 (0 Z c )) 11 | 12 | -------------------------------------------------------------------------------- /carmel/sample/tree-cascade/observed1.data: -------------------------------------------------------------------------------- 1 | Z Z Z 2 | 
-------------------------------------------------------------------------------- /carmel/sample/tree-cascade/observed1.fst: -------------------------------------------------------------------------------- 1 | 0 2 | (0 (0 X *e* )) 3 | (0 (0 Y *e* )) 4 | (0 (0 Z )) 5 | -------------------------------------------------------------------------------- /carmel/sample/tree-cascade/observed2.data: -------------------------------------------------------------------------------- 1 | Y 2 | -------------------------------------------------------------------------------- /carmel/sample/tree-cascade/observed2.fst: -------------------------------------------------------------------------------- 1 | 0 2 | (0 (0 X *e* )) 3 | (0 (0 Y Y )) 4 | (0 (0 Z *e* )) 5 | 6 | -------------------------------------------------------------------------------- /carmel/sample/tree-cascade/s/hidden.fsa: -------------------------------------------------------------------------------- 1 | 0 2 | (0 (0 *e* Z )) 3 | (0 (0 *e* X )) 4 | (0 (0 *e* Y )) 5 | 6 | -------------------------------------------------------------------------------- /carmel/sample/tree-cascade/s/observed0.data: -------------------------------------------------------------------------------- 1 | 2 | a a b c a b c b b a 3 | -------------------------------------------------------------------------------- /carmel/sample/tree-cascade/s/observed0.fst: -------------------------------------------------------------------------------- 1 | 0 2 | (0 (0 X a )) 3 | (0 (0 Y a )) 4 | (0 (0 Z a )) 5 | (0 (0 X b )) 6 | (0 (0 Y b )) 7 | (0 (0 Z b )) 8 | (0 (0 X c )) 9 | (0 (0 Y c )) 10 | (0 (0 Z c )) 11 | 12 | -------------------------------------------------------------------------------- /carmel/sample/tree-cascade/s/observed1.data: -------------------------------------------------------------------------------- 1 | Z 2 | Z 3 | -------------------------------------------------------------------------------- /carmel/sample/tree-cascade/s/observed1.fst: 
-------------------------------------------------------------------------------- 1 | F 2 | (0 (0 X *e* )) 3 | (0 (0 Y *e* )) 4 | (0 (0 Z Z )) 5 | (0 (F) ) 6 | -------------------------------------------------------------------------------- /carmel/sample/tree-cascade/s/observed2.data: -------------------------------------------------------------------------------- 1 | 2 | Y 3 | -------------------------------------------------------------------------------- /carmel/sample/tree-cascade/s/observed2.fst: -------------------------------------------------------------------------------- 1 | 0 2 | (0 (0 X *e* )) 3 | (0 (0 Y Y )) 4 | (0 (0 Z *e* )) 5 | 6 | -------------------------------------------------------------------------------- /carmel/sample/wfsa.perplexity: -------------------------------------------------------------------------------- 1 | 2 2 | (0 (1 .5) (2 .5)) 3 | -------------------------------------------------------------------------------- /carmel/sample/wfsa1: -------------------------------------------------------------------------------- 1 | 3 2 | (0 (1 "he" 0.4)) 3 | (1 (2 "saw" 0.8)) 4 | (2 (3 "me" 1.0)) 5 | (1 (4 "ran" 0.2)) 6 | (4 (3 "home" 1.0)) 7 | (0 (5 "she" 0.6)) 8 | (5 (3 "talked" 1.0)) 9 | -------------------------------------------------------------------------------- /carmel/sample/wfsa2: -------------------------------------------------------------------------------- 1 | FINAL 2 | (START (PRO "PRO" 0.6)) 3 | (START (NOUN "NOUN" 0.3)) 4 | (START (AUX "AUX" 0.05)) 5 | (START (VERB "VERB" 0.05)) 6 | (PRO (AUX "AUX" 0.4)) 7 | (PRO (VERB "VERB" 0.6)) 8 | (NOUN (NOUN "NOUN" 0.7)) 9 | (NOUN (VERB "VERB" 0.3)) 10 | (AUX (VERB "VERB" 1.0)) 11 | (VERB (NOUN "NOUN" 1.0)) 12 | (VERB (FINAL *e* 1.0)) 13 | (AUX (FINAL *e* 1.0)) 14 | (NOUN (FINAL *e* 1.0)) 15 | (PRO (FINAL *e* 1.0)) 16 | -------------------------------------------------------------------------------- /carmel/sample/wfsa3: 
-------------------------------------------------------------------------------- 1 | 1 2 | (0 (1 a .3) (1 e^-5) (1 () (b .5))) 3 | -------------------------------------------------------------------------------- /carmel/sample/wfsa4: -------------------------------------------------------------------------------- 1 | 1 2 | (0 (0 a -1log) (0 .5) (0) (1) (1 b)) 3 | (1 1 .5) 4 | -------------------------------------------------------------------------------- /carmel/sample/wfst1: -------------------------------------------------------------------------------- 1 | S 2 | (S (S "they" "PRO" 1.0)) 3 | (S (S "can" "AUX" 0.99)) 4 | (S (S "can" "VERB" 0.01)) 5 | (S (S "fish" "NOUN" 0.7)) 6 | (S (S "fish" "VERB" 0.3)) 7 | -------------------------------------------------------------------------------- /carmel/sample/wfst2: -------------------------------------------------------------------------------- 1 | S 2 | (S (N "they" "PRO" 1.0)) 3 | (N (Q "they" "PRO" 1.0)) 4 | (S (S "they" "PRO" 1.0)) 5 | (S (S "can" "AUX" 0.99)) 6 | (S (S "can" "VERB" 0.01)) 7 | (S (S "fish" "NOUN" 0.7)) 8 | (S (S "fish" "VERB" 0.3)) 9 | -------------------------------------------------------------------------------- /carmel/sample/wfst2.preprune: -------------------------------------------------------------------------------- 1 | S 2 | (S (N "they" "PRO" 1.0)) 3 | (N (Q "they" "PRO" 1.0)) 4 | (S (S "they" "PRO" 1.0)) 5 | (S (S "can" "AUX" 0.99)) 6 | (S (S "can" "VERB" 0.01)) 7 | (S (S "fish" "NOUN" 0.7)) 8 | (S (S "fish" "VERB" 0.3)) 9 | -------------------------------------------------------------------------------- /carmel/sample/wfst3: -------------------------------------------------------------------------------- 1 | 2 2 | (0 (2 0.1) (1 *e* b 0.6) (0 *e* a 0.3) (4 *e* a .1)) 3 | (1 (2 *e* a 0.3) (0 *e* b 0.7)) 4 | (2) 5 | (4 (2 *e* c)) 6 | -------------------------------------------------------------------------------- /carmel/sample/wfst3c: 
-------------------------------------------------------------------------------- 1 | 0 2 | (0 (0 a c .6) (0 a d .4) (0 b d .2) (0 b e .8)) 3 | -------------------------------------------------------------------------------- /carmel/sample/wfstlog: -------------------------------------------------------------------------------- 1 | S 2 | (S (S "PRO" "they" -20log)) 3 | (S (S "AUX" "can" -20ln)) 4 | (S (S "VERB" "can" 200log)) 5 | (S (S "NOUN" "fish" 0.0001)) 6 | (S (S "VERB" "fish" 0.0001)) 7 | -------------------------------------------------------------------------------- /carmel/src/Makefile: -------------------------------------------------------------------------------- 1 | default: 2 | cd .. && make 3 | -------------------------------------------------------------------------------- /carmel/src/WARNING: -------------------------------------------------------------------------------- 1 | Carmel is the first significant C++ I wrote. It's pre-STL. I wouldn't write it this way today, but it's not worth a rewrite. Here are some things to watch out for: 2 | 3 | I wrote a node-based hashtable which means it's safe to directly contain a singly linked list by value (because nodes are never copied in normal hashtable operation). This means you'd have horrible performance on grows if you switched to open hashing. The gnu hashtable is also node-based. 4 | 5 | arcs' groupId field is reused as an index for other purposes (e.g. tracking lists of original cascade arcs in a composition, associating arcs w/ gibbs parameter ids) 6 | 7 | Command line option processing is gross. I'd use boost program_options today. --long options aren't spelling checked / closed-class. 8 | 9 | singly linked lists were used for arcs and other things, so as to be relatively memory-concise, and mutable. a growing array (e.g. std::vector) might be better cache-locality and performance, and certainly lower space if compacted after e.g. reading lists from file (size won't change after that). 
10 | 11 | -------------------------------------------------------------------------------- /carmel/src/models.h: -------------------------------------------------------------------------------- 1 | #ifndef MODELS_H 2 | #define MODELS_H 1 3 | #include 4 | #include 5 | 6 | char *ModelsDef[] = { 7 | "0 (0 (0 \"A\" \"A\" 0.75) (0 \"AA\" \"A\" 0.25) (0 \"B\" \"B\" 0.67) (0 \"BB\" \"B\" 0.33))", 8 | "0 (0 (0 \"A\" \"a\") (0 \"B\" \"b\"))" 9 | }; 10 | // define additional models if needed 11 | 12 | 13 | vector Models ; 14 | void initModels() 15 | { 16 | // Similarly, add additional models if necessary 17 | int n_models = sizeof(ModelsDef)/sizeof(char *); 18 | for (int i=0;i 4 | using namespace std; 5 | 6 | main() 7 | { 8 | List l; 9 | Weight a,b; 10 | insert_iterator > o(l,l.begin()); 11 | for (;;) { 12 | 13 | cin >> a >> b; 14 | if (cin) { 15 | *o++ = a; 16 | *o++ = b; 17 | Weight::out_ln(cout);Weight::out_always_real(cout); 18 | cout << "a=" << a << " b=" << b << " a*b=" << a*b << " a/b=" << a/b << " a+b=" << a+b << " a-b=" << a-b << endl; 19 | Weight::out_always_log(cout); 20 | cout << "a=" << a << " b=" << b << " a*b=" << a*b << " a/b=" << a/b << " a+b=" << a+b << " a-b=" << a-b << endl; 21 | Weight::out_variable(cout); 22 | cout << "a=" << a << " b=" << b << " a*b=" << a*b << " a/b=" << a/b << " a+b=" << a+b << " a-b=" << a-b << endl; 23 | } else 24 | break; 25 | } 26 | cout << "\n"; 27 | for (List::iterator i=l.begin();i!=l.end();++i) 28 | cout << *i << " "; 29 | cout << "\n"; 30 | cout << "\n"; 31 | l.reverse(); 32 | for (List::const_iterator i=l.const_begin(),end=l.const_end();i!=end;++i) 33 | cout << *i << " "; 34 | cout << "\n"; 35 | 36 | } 37 | -------------------------------------------------------------------------------- /carmel/test/N.cascade.train.gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | carmel=${carmel:-carmel} 4 | N=${N:-100} 5 | M=${M:-5} 6 | function safefilename { 7 | echo 
"$@" | perl -pe 's/\W+/./g' 8 | } 9 | comp=comp.`safefilename $*` 10 | corp=.corpus.$comp.$N 11 | $carmel "$@" > $comp 12 | $carmel -g $N $comp > $corp 13 | uchain= 14 | for f in $*; do 15 | $carmel -n --constant-weight=1 $f > $f.u 16 | uchain+=" $f.u" 17 | done 18 | $carmel -S $corp $uchain >/dev/null 19 | $carmel -S $corp $comp >/dev/null 20 | $carmel -M $M --train-cascade $ARGS $corp $uchain 21 | for f in $*; do 22 | echo original: 23 | $carmel $f 24 | echo trained: 25 | $carmel $f.u.trained 26 | done 27 | -------------------------------------------------------------------------------- /carmel/test/angela.knight.kbest.wfst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/carmel/test/angela.knight.kbest.wfst -------------------------------------------------------------------------------- /carmel/test/asciikana-katakana.transducer: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/carmel/test/asciikana-katakana.transducer -------------------------------------------------------------------------------- /carmel/test/bad.-a.1: -------------------------------------------------------------------------------- 1 | 2 2 | (0 (2 "A" "b" .2) (2 "B" "b" 0.5) (1 "C" *e* 0.5)) 3 | (1 (2 "D" "b" 0.5) (0 "E" "a" 0.5)) 4 | (2) 5 | -------------------------------------------------------------------------------- /carmel/test/bad.-a.2: -------------------------------------------------------------------------------- 1 | F 2 | (S (S "a" "X" 0.5) (S "b" "Y" 0.3) (F *e* "Z" .2)) 3 | (F) 4 | -------------------------------------------------------------------------------- /carmel/test/compose-test.sh: -------------------------------------------------------------------------------- 1 | $B -rsim jpron.transducer vowel-separator.transducer 
jpron-asciikana.transducer asciikana-katakana.transducer < test.katakana 2 | -------------------------------------------------------------------------------- /carmel/test/determinize.usr.dict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | input=${input:-/usr/share/dict/words} 4 | if [ ! "$skipdict" ] ; then 5 | ../make-dictionary.pl -r $input > dict.words.char.fsa.random 6 | carmel -jn dict.words.char.fsa.random > dict.words.char.fsa 7 | cp dict.words.char.fsa looped.dict.words.char.fsa 8 | echo '(END 0 " ")' >> looped.dict.words.char.fsa 9 | fi 10 | carmel --minimize --minimize-determinize $* dict.words.char.fsa -F det.dict.fsa 11 | carmel --minimize --minimize-determinize $* looped.dict.words.char.fsa -F det.looped.dict.fsa 12 | carmel -kO 20 det.dict.fsa 13 | carmel -kO 20 dict.words.char.fsa 14 | carmel -kO 20 det.looped.dict.fsa 15 | carmel -kO 20 looped.dict.words.char.fsa 16 | -------------------------------------------------------------------------------- /carmel/test/empty: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/carmel/test/empty -------------------------------------------------------------------------------- /carmel/test/fsa7: -------------------------------------------------------------------------------- 1 | 3 2 | (0 (1 "they")) 3 | (1 (2 "can")) 4 | (2 (3 "fish")) 5 | -------------------------------------------------------------------------------- /carmel/test/jpron.transducer: -------------------------------------------------------------------------------- 1 | START 2 | (START (START ("PAUSE") ("A") ("E") ("I") ("O") ("U") ("N")) 3 | (C1 ("N") ("NN") ("K") ("KK") ("S") ("SS") ("SH") ("SSH") ("T") 4 | ("TT") ("D" 0.05) ("DD" 0.05) ("TS") ("TTS") ("M") ("MM") ("R") ("RR") 5 | ("G") ("GG") ("Z") ("ZZ") ("J") ("JJ") ("F" 0.05) ("FF" 0.05) 6 | ("CH") ("TCH") 
("B") ("BB") ("P") ("PP") ("H") ("HH") ("V" 0.05)) 7 | (C2 ("NN") ("K") ("KK") ("S") ("SS") ("SH") ("SSH") ("T") 8 | ("TT") ("TS") ("TTS") ("M") ("MM") ("R") ("RR") 9 | ("G") ("GG") ("Z") ("ZZ") ("J") ("JJ") ("F" 0.05) ("FF" 0.05) 10 | ("CH") ("TCH") ("B") ("BB") ("P") ("PP") ("W") ("Y") ("H") ("HH") ("V" 0.05)) 11 | (C3 ("K") ("KK") ("S") ("SS")) 12 | (D ("D") ("DD"))) 13 | (C1 (C2 ("Y"))) 14 | (C2 (START ("A") ("E") ("I") ("O") ("U"))) 15 | (C3 (C2 ("W" 0.05))) 16 | (D (START ("A") ("E") ("I" 0.05) ("O") ("U" 0.05))) 17 | -------------------------------------------------------------------------------- /carmel/test/kbest.small.cycle: -------------------------------------------------------------------------------- 1 | 2 2 | (0 (0 *e* b .033333) (2 0.3) (1 0.333333) (0 *e* a 0.333333)) 3 | (1 (2 *e* b 0.5) (0 0.5)) 4 | (2 (0 .1) (1 .9)) 5 | -------------------------------------------------------------------------------- /carmel/test/prune.test: -------------------------------------------------------------------------------- 1 | 01 2 | (00 (10 .1)) 3 | (10 (11 .1) (20 .1) (01 .1)) 4 | (20 (21 .1) (11 .1)) 5 | (11 (01 .1) (10 .1)) 6 | (21 (11 .1) (20 .1)) 7 | (01 (00 .1)) 8 | -------------------------------------------------------------------------------- /carmel/test/runtests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd `dirname $0` 3 | B=${1:-../bin/macosx/carmel} 4 | which $B 5 | mkdir -p logs 6 | log=logs/tests.`basename $B`.`date +%C%y%m%d_%H:%M` 7 | (echo $B;ls -l $B;uname -a;hostname; time . traintest.sh;time $B -IEQ -k 1000 angela.knight.kbest.wfst;time . 
j-test-jap ) 2>&1 | tee $log 8 | ln -sf $log latest.log 9 | echo 10 | echo `pwd`/latest.log 11 | -------------------------------------------------------------------------------- /carmel/test/test.asciikana: -------------------------------------------------------------------------------- 1 | "a" "n" "ji" "ra" "na" "i" "to" 2 | -------------------------------------------------------------------------------- /carmel/test/test.compose.-a.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . ~/isd/hints/aliases.sh 3 | B=${B:-carmel} 4 | N=${N:-10} 5 | a=$1 6 | b=${2:?args: xdcr1 xdcr2 (for composition)} # abort with the usage message when the second transducer is missing 7 | set -x 8 | set -e 9 | $B -N 100000 $a > $a.g 10 | $B -N 200000 $b > $b.g 11 | $B -m $a $b > $a.comp.$b 12 | $B -am $a.g $b.g > $a.comp.-a.$b 13 | $B -@k $N $a.comp.$b > $a.composed.best 14 | $B -k $N $a.comp.$b > $a.composed.paths 15 | $B -S $a.composed.best $a.comp.$b 16 | $B -S $a.composed.best $a.comp.-a.$b 17 | -------------------------------------------------------------------------------- /carmel/test/test.epron: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/carmel/test/test.epron -------------------------------------------------------------------------------- /carmel/test/test.final: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/carmel/test/test.final -------------------------------------------------------------------------------- /carmel/test/test.jpron2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/carmel/test/test.jpron2 -------------------------------------------------------------------------------- /carmel/test/test.kana: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/carmel/test/test.kana -------------------------------------------------------------------------------- /carmel/test/test.katakana: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/carmel/test/test.katakana -------------------------------------------------------------------------------- /carmel/test/test.word: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/carmel/test/test.word -------------------------------------------------------------------------------- /carmel/test/train.a: -------------------------------------------------------------------------------- 1 | fin 2 | (start (start *e* a) (1 *e* b) (fin) (fin *e* d 0) (fin *e* c)) 3 | (1 (start *e* b) (fin *e* a)) 4 | (fin) 5 | -------------------------------------------------------------------------------- /carmel/test/train.a.u: -------------------------------------------------------------------------------- 1 | 2 2 | (0 (2 0.333333) (1 *e* b 0.333333) (0 *e* a 0.333333)) 3 | (1 (2 *e* a 0.5) (0 *e* b 0.5)) 4 | (2) 5 | -------------------------------------------------------------------------------- /carmel/test/train.a.w: -------------------------------------------------------------------------------- 1 | 2 2 | (0 (2 0.1) (1 *e* b 0.6) (0 *e* a 0.3)) 3 | (1 (2 *e* a 0.3) (0 *e* b 0.7)) 4 | (2) 5 | -------------------------------------------------------------------------------- /carmel/test/train.cascade.gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | carmel=${carmel:-carmel} 4 | fst1=${1:?arg 1: a wfst} 5 | fst2=${2:?arg 2: a wfst to 
compose w/ fst 1} 6 | shift 7 | shift 8 | N=${N:-100} 9 | M=${M:-5} 10 | comp=$fst1.comp.$fst2 11 | corp=.corpus.$comp.$N 12 | $carmel $fst1 $fst2 > $comp 13 | $carmel -g $N $comp > $corp 14 | $carmel -n --constant-weight=1 $fst1 > $fst1.u 15 | $carmel -n --constant-weight=1 $fst2 > $fst2.u 16 | $carmel -S $corp $fst1.u $fst2.u >/dev/null 17 | $carmel -S $corp $comp >/dev/null 18 | $carmel -M $M --train-cascade $* $corp $fst1.u $fst2.u 19 | echo trained: 20 | $carmel $fst1.u.trained 21 | $carmel $fst2.u.trained 22 | echo original: 23 | $carmel $fst1 24 | $carmel $fst2 25 | 26 | -------------------------------------------------------------------------------- /carmel/test/train.self.gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | carmel=${carmel:-carmel} 4 | fst=${1:?arg 1: a wfst e.g. train.a.w} 5 | shift 6 | N=${N:-100} 7 | $carmel -g $N $fst > corpus.$fst.$N 8 | $carmel --constant-weight=1 $fst > $fst.u 9 | $carmel -S corpus.$fst.$N $fst.u >/dev/null 10 | $carmel -F $fst.trained.self.gen -t $* corpus.$fst.$N $fst.u 11 | #echo original: 12 | #$carmel $fst 13 | 14 | -------------------------------------------------------------------------------- /carmel/test/traintest.sh: -------------------------------------------------------------------------------- 1 | cmd="$B -o 1.1 -M 4 -F span.spell.trained2 -t span.spell.corpus span.spell.wfst" 2 | #$B -o 1.1 -M 3 -F span.spell.trained2 -t span.spell.corpus span.spell.wfst 3 | echo $cmd 4 | $cmd 5 | echo $cmd 6 | -------------------------------------------------------------------------------- /carmel/test/vowel-separator.transducer: -------------------------------------------------------------------------------- 1 | 0 2 | (0 (a "A" "AA") (i "I" "II") (o "O" "OO") (u "U" "UU") (e "E" "EE") 3 | (0 "A" "A" 0.99) (0 "B" "B") (0 "BB" "BB") (0 "CH" "CH") (0 "D" "D") (0 "DD" "DD") 4 | (0 "E" "E" 0.99) (0 "G" "G") (0 "GG" "GG") (0 "H" "H") (0 "HH" "HH") (0 "I" 
"I" 0.99) 5 | (0 "J" "J") (0 "JJ" "JJ") (0 "K" "K") (0 "KK" "KK") (0 "M" "M") (0 "MM" "MM") 6 | (0 "N" "N") (0 "NN" "NN") (0 "O" "O" 0.99) (0 "P" "P") (0 "PAUSE" "PAUSE") 7 | (0 "PP" "PP") (0 "R" "R") (0 "RR" "RR") (0 "S" "S") (0 "SH" "SH") (0 "SS" "SS") 8 | (0 "SSH" "SSH") (0 "T" "T") (0 "TCH" "TCH") (0 "TS" "TS") (0 "TT" "TT") 9 | (0 "TTS" "TTS") (0 "U" "U" 0.99) (0 "V" "V") (0 "W" "W") (0 "Y" "Y") 10 | (0 "F" "F") (0 "FF" "FF") (0 "Z" "Z") (0 "ZZ" "ZZ")) 11 | (a (0 "A" *e*)) 12 | (i (0 "I" *e*)) 13 | (e (0 "E" *e*)) 14 | (o (0 "O" *e*)) 15 | (u (0 "U" *e*)) 16 | -------------------------------------------------------------------------------- /carmel/test/wfst2: -------------------------------------------------------------------------------- 1 | S 2 | (S (S "PRO" "they" 0.07)) 3 | (S (S "AUX" "can" 0.21)) 4 | (S (S "VERB" "can" 0.00001)) 5 | (S (S "NOUN" "fish" 0.0001)) 6 | (S (S "VERB" "fish" 0.0001)) 7 | -------------------------------------------------------------------------------- /carmel/test/word-epron.names.55000wds.transducer: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/carmel/test/word-epron.names.55000wds.transducer -------------------------------------------------------------------------------- /carmel/test/y.data: -------------------------------------------------------------------------------- 1 | 2 | "a" "a" "a" "b" "a" "a" "b" "a" "a" "b" 3 | -------------------------------------------------------------------------------- /carmel/test/y1.new: -------------------------------------------------------------------------------- 1 | 2 2 | (1 (2 *e* "C" 0.2!) (1 *e* "V" 0.8!)) 3 | (2 (2 *e* "C" 0.2!) 
(1 *e* "V" 0.8!)) 4 | -------------------------------------------------------------------------------- /carmel/test/y2.new: -------------------------------------------------------------------------------- 1 | 0 2 | (0 (0 "C" "b" 0.5!100) (0 "C" "a" 0.5!101) (0 "V" "b" 0.5!102) (0 "V" "a" 0.5!103)) 3 | -------------------------------------------------------------------------------- /carmel/test/y4.new: -------------------------------------------------------------------------------- 1 | 4 2 | (0 (1 *e* *e* 0.2!)) 3 | (0 (2 *e* *e* 0.8!)) 4 | (2 (0 *e* "a" 0.5!103)) 5 | (2 (0 *e* "b" 0.5!102)) 6 | (1 (4 *e* "a" 0.5!101)) 7 | (1 (4 *e* "b" 0.5!100)) 8 | (4 (3 *e* *e* 0.2!)) 9 | (4 (5 *e* *e* 0.8!)) 10 | (3 (0 *e* "a" 0.5!103)) 11 | (3 (0 *e* "b" 0.5!102)) 12 | (5 (4 *e* "a" 0.5!101)) 13 | (5 (4 *e* "b" 0.5!100)) 14 | -------------------------------------------------------------------------------- /cipher/baseline.2.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | 4 | open A,"test.freq" or die; 5 | open B,"train.freq" or die; 6 | 7 | my $N=0; 8 | my $right=0; 9 | my $Ndict=0; 10 | my $rightdict=0; 11 | 12 | while() { 13 | my $a=$_; 14 | my $b=; 15 | last unless defined $b; 16 | my ($na,$wa)=split ' ',$a; 17 | my ($nb,$wb)=split ' ',$b; 18 | $Ndict++; 19 | $N+=$na; 20 | if ($wa eq $wb) { 21 | print STDERR $a; 22 | $right+=$na; 23 | $rightdict++; 24 | } 25 | } 26 | 27 | print "per-word ($rightdict correct out of $Ndict unique test words) accuracy: ",$rightdict/$Ndict,"\n"; 28 | print "per-running-text (out of $N running test words) accuracy: ",$right/$N,"\n"; 29 | 30 | -------------------------------------------------------------------------------- /cipher/carmel-quote-words: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | 4 | # use for all single words as tokens in FSA 5 | sub escape_for_carmel 6 | { 7 | my ($s)=@_; 8 
| $s =~ s/([\"])/\\$1/og; # backslash-escape embedded double quotes (a bare \$1 would insert the literal text $1) 9 | return qq{"$s"}; 10 | } 11 | 12 | while(<>) { 13 | s/(\S+)/&escape_for_carmel($1)/oge; 14 | print; 15 | } 16 | 17 | -------------------------------------------------------------------------------- /cipher/class-features: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # needs carmel binaries in PATH or in carmel env var 3 | 4 | d=`dirname $0` 5 | nclass=${nclass:-4} 6 | class="class$nclass" 7 | maxorder=${maxorder:-2} 8 | lmsuf=${maxorder}gram 9 | test=${test:-test} 10 | train=${train:-train} 11 | chan=$class/class-channel.$train.$test 12 | tchan=${tchan:-$chan.$lmsuf} 13 | 14 | if [ "$FLOOR" ] ; then 15 | echo class FLOOR=$FLOOR 16 | fi 17 | 18 | fbase=$class/feats.$lmsuf 19 | ftrain=$fbase.train 20 | ftest=$fbase.test 21 | 22 | set -x 23 | $d/unigram-freq-bands $train $ftrain.uni < $test > $ftest.uni && \ 24 | $d/soft-classes $class/$train < $ftrain.uni > $ftrain && \ 25 | $d/soft-classes $class/$test $tchan < $ftest.uni > $ftest 26 | rm $ftrain.uni $ftest.uni 27 | -------------------------------------------------------------------------------- /cipher/class-ngrams: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | nclass=${nclass:-4} 4 | class="class$nclass" 5 | mkdir -p $class 6 | test=${test:-test} 7 | train=${train:-train} 8 | texts="$test $train" 9 | #texts=${texts:-train test test.cipher} 10 | echo making lm classes for \"$texts\" 11 | for t in $texts ; do 12 | ngram-class -numclasses $nclass -text $t -class-counts $class/$t.counts -classes $class/$t 13 | done 14 | -------------------------------------------------------------------------------- /cipher/class-word-fst: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | 4 | my $S="S"; 5 | 6 | print "$S\n"; 7 | print "($S\n"; 8 | 9 | my $invert=$ENV{INVERT}; # fst takes word->class if set. 
10 | my $floor=$ENV{FLOOR}; # drop lines w/ p) { 21 | my ($class,$p,$w)=split; 22 | next if defined $floor && $p < $floor; 23 | $w=escape_for_carmel($w); 24 | if ($invert) { 25 | my $t=$w; 26 | $w=$class; 27 | $class=$t; 28 | } 29 | print " ($S $class $w $p)\n"; 30 | } 31 | 32 | print ")\n"; 33 | -------------------------------------------------------------------------------- /cipher/class-word-wfst: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | 4 | my $S="S"; 5 | 6 | print "$S\n"; 7 | print "($S\n"; 8 | 9 | my $invert=$ENV{INVERT}; # fst takes word->class if set. 10 | my $floor=$ENV{FLOOR}; # drop lines w/ p) { 21 | my ($class,$p,$w)=split; 22 | next if defined $floor && $p < $floor; 23 | $w=escape_for_carmel($w); 24 | if ($invert) { 25 | my $t=$w; 26 | $w=$class; 27 | $class=$t; 28 | } 29 | print " ($S $class $w $p)\n"; 30 | } 31 | 32 | print ")\n"; 33 | -------------------------------------------------------------------------------- /cipher/count-ngrams: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | d=`dirname $0` 3 | maxorder=${maxorder:-3} 4 | texts=${texts:-train test test.cipher} 5 | out=${out:-lms} 6 | mkdir -p $out 7 | echo making lms and counts for texts \"$texts\" up to ngram order \"$maxorder\" 8 | for t in $texts ; do 9 | writearg="-write $out/$t.counts" 10 | for N in `seq 1 $maxorder` ; do 11 | writearg="$writearg -write$N $out/$t.counts$N" 12 | done 13 | trainlm=$out/$t.${maxorder}gram 14 | echo counting for $t: ngram-count -order $maxorder -unk -sort -text $t $writearg -lm $trainlm 15 | ngram-count -order $maxorder -unk -sort -text $t $writearg -lm $trainlm 16 | for N in `seq 1 $maxorder` ; do 17 | sort -rnk $((N+1)) $out/$t.counts$N > $out/$t.sortcounts$N 18 | done 19 | 20 | NOQUOTE= CHECK_SUFFIX= EOS= $d/sri2fsa.pl $trainlm | carmel -Ns 0 > $trainlm.fst 21 | 22 | done 23 | 
-------------------------------------------------------------------------------- /cipher/decipher: -------------------------------------------------------------------------------- 1 | encipher -------------------------------------------------------------------------------- /cipher/decipher-classes: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # TODO: train on unclassed ciphertext w/ soft clusters in line? 3 | # needs carmel binaries in PATH 4 | d=`dirname $0` 5 | maxorder=${maxorder:-2} 6 | N=$maxorder 7 | lmsuf=${N}gram 8 | test=${test:-test} 9 | train=${train:-train} 10 | texts="$test $train" 11 | nclass=${nclass:-4} 12 | class="class$nclass" 13 | mkdir -p $class 14 | 15 | echo producing trained carmel channel $class/class-channel.$train.$test.$lmsuf from train $train and test $test # tchan is only assigned further down, so its path is spelled out here 16 | 17 | set -x 18 | for t in $texts ; do 19 | c=$class/$t 20 | $d/text-to-classes $c $t > $c.classtext 21 | done 22 | 23 | ct=$class/$train 24 | trainlm=$ct.$lmsuf 25 | classfsa=$trainlm.fsa 26 | classfst=$trainlm.fst 27 | ngram-count -order $N -sort -text $ct.classtext -lm $trainlm 28 | NOQUOTE=1 CHECK_SUFFIX= EOS= $d/sri2fsa.pl $trainlm > $classfst 29 | carmel -N 0 --project-right $classfst > $classfsa 30 | chan=$class/class-channel.$train.$test 31 | tchan=$chan.$lmsuf 32 | uchan=$chan.untrained 33 | 34 | ctest=$class/$test 35 | $d/full-class-channel $ct $ctest > $uchan 36 | QUOTE= $d/epsilon-string-pairs $ctest.classtext | carmel $trainopt -sta --train-cascade $classfsa $uchan 37 | mv $uchan.trained $tchan 38 | echo trained class channel in $tchan 39 | 40 | 41 | #cw=$test.class.word 42 | #INVERT= class-word-fst $ctest > $cw 43 | #QUOTE=1 $d/epsilon-string-pairs $test | carmel $trainopt -sta --train-cascade $classfsa $uchan $cw 44 | #mv $uchan.trained $tchan 45 | #tcw=class-cipherword.$lmsuf 46 | #mv $cw $tcw 47 | 48 | 49 | #echo adjusted cipher class-word soft clusters in $tcw 50 | #echo fixed cipher class-word trained class channel in $tchan.fixed.class 
-------------------------------------------------------------------------------- /cipher/do-classes: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # needs carmel binaries in PATH 3 | d=`dirname $0` 4 | nclass=${nclass:-4} 5 | class="class$nclass" 6 | mkdir -p $class 7 | $d/class-ngrams 8 | $d/decipher-classes 9 | $d/eval-classes 10 | $d/class-features -------------------------------------------------------------------------------- /cipher/encipher: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # not crypto secure ;) reverses order of printable nonspace ascii (run again to 4 | # get plaintext) 5 | 6 | use strict; 7 | 8 | my $ne=127; # DEL (127) 9 | my $n0=32; # ascii space, not reversed 10 | 11 | my %t; 12 | 13 | for my $o (0..255) { 14 | my $c=chr($o); 15 | if ($o<$ne && $o>$n0) { 16 | $o=$ne-($o-$n0); 17 | } 18 | $t{$c}=chr($o); 19 | } 20 | 21 | while(<>) { 22 | s#(.)#$t{$1}#g; 23 | print; 24 | } 25 | -------------------------------------------------------------------------------- /cipher/epsilon-string-pairs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | 4 | my $QUOTE=$ENV{QUOTE}; 5 | 6 | while(<>) { 7 | print "\n"; 8 | if ($QUOTE) { 9 | s/(\S+)/"$1"/og; 10 | } 11 | print; 12 | } 13 | -------------------------------------------------------------------------------- /cipher/filter_docid: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | while(<>) { 4 | s/^\S+\s*/ /; 5 | s/ \%(\S*)\b/++$spec{$1};''/ge; 6 | print; 7 | } 8 | 9 | for (sort keys %spec) { 10 | print STDERR "removed $spec{$_} $_\n"; 11 | } 12 | -------------------------------------------------------------------------------- /cipher/full-class-channel: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 
| use strict; 3 | 4 | my %c1; 5 | my %c2; 6 | 7 | 8 | sub get_classes { 9 | my ($h,$f)=@_; 10 | open F,"<",$f or die; 11 | while () { 12 | my ($class)=split ' ',$_,2; 13 | ++$h->{$class}; 14 | } 15 | } 16 | 17 | get_classes(\%c1,shift); 18 | get_classes(\%c2,shift); 19 | 20 | my @c2=sort keys %c2; 21 | my $n2=scalar @c2; 22 | my $pcond=1./$n2; 23 | 24 | my $S="S"; 25 | 26 | print "$S\n"; 27 | print "($S\n"; 28 | 29 | for my $k1 (sort keys %c1) { 30 | for (@c2) { 31 | print " ($S $k1 $_ $pcond)\n"; 32 | } 33 | } 34 | 35 | print ")\n"; 36 | -------------------------------------------------------------------------------- /cipher/split-words: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -n 2 | split; 3 | print $_,"\n" for @_; 4 | -------------------------------------------------------------------------------- /cipher/sri2fsa.pl: -------------------------------------------------------------------------------- 1 | ../carmel/src/sri2fsa.pl -------------------------------------------------------------------------------- /cipher/summary-classes: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for f in $*; do 3 | echo 4 | echo ==================== 5 | echo $f 6 | tail -n 2 $f/class-channel.train.test.*gram.accuracy.top-* | grep -v Conditional 7 | done -------------------------------------------------------------------------------- /cipher/text-to-classes: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | 4 | my %c; 5 | 6 | my $UNKTO=$ENV{UNKTO}; 7 | 8 | open C,"<",shift or die; 9 | while() { 10 | my ($class,$p,$w)=split; 11 | $c{$w}=$class; 12 | } 13 | 14 | $UNKTO="the" unless defined $UNKTO; 15 | 16 | my $unkto=$c{$UNKTO}; 17 | print STDERR "text-to-classes assigning unknown words to the class for '${UNKTO}' ($unkto).\n" if $unkto; 18 | 19 | while(<>) { 20 | s/(\S+)/$c{$1} || $unkto || die 
"class missing for word $1 (UNKTO=$UNKTO unkto_class=$unkto)"/oge; 21 | print; 22 | } 23 | 24 | -------------------------------------------------------------------------------- /cipher/word-freq: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | d=`dirname $0` 3 | 4 | if [ "$top" ] ; then 5 | $d/split-words $* | sort | uniq -c | sort -rn | head -n $top 6 | else 7 | $d/split-words $* | sort | uniq -c | sort -rn 8 | fi 9 | -------------------------------------------------------------------------------- /clm/clm-jan-09.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/clm/clm-jan-09.pdf -------------------------------------------------------------------------------- /clm/e-parse-yield.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | #input: one per line ghkm-format trees ... (NN dog) (-LRB- () (-RRB- )) 3 | #output: one per line yield ... dog ( ) 4 | my $DEBUG=$ENV{DEBUG}; 5 | while(<>) { 6 | my $sp=''; 7 | while (/\(([^() ]+) ([^ ]+)\)( |$)/g) { 8 | my ($pos,$lex)=($1,$2); 9 | print STDERR "($pos $lex) " if $DEBUG; 10 | print "$sp$lex"; 11 | $sp=' '; 12 | } 13 | print STDERR "\n" if $DEBUG; 14 | print "\n"; 15 | } 16 | -------------------------------------------------------------------------------- /clm/shen08.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/clm/shen08.pdf -------------------------------------------------------------------------------- /clm/uniq_srilm.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # input: srilm 3 | # optional env: order (checks just that order). nodup: skip dup check. 
print: print events (without prob/bo) 4 | # output: die on duplicates ngram events (no output, no exit code = no duplicates) 5 | 6 | use warnings; 7 | my $order=$ENV{order}; 8 | my $print=$ENV{print}; 9 | my $dup=!$ENV{nodup}; 10 | 11 | my %ctx; 12 | my $N=0; 13 | while(<>) { 14 | if (/^\\(\d+)-grams:\s*$/o) { 15 | $N=$1; 16 | print STDERR "starting $N-grams...\n"; 17 | } elsif (/^\\end\\$/) { 18 | $N=0; 19 | %ctx=(); 20 | } elsif ($N==0 || ($order&&$order!=$N) || /^\s*$/ ) { 21 | } else { 22 | my @w=split; 23 | my $ctx=join(' ',@w[1..$N]); 24 | if ($dup) { 25 | die "DUPLICATE ($ARGV): $ctx :\n$_ " if exists $ctx{$ctx}; 26 | $ctx{$ctx}=1; 27 | } 28 | print $ctx,"\n" if $print; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /forest-em/.gitignore: -------------------------------------------------------------------------------- 1 | forest-em.README.hpp 2 | -------------------------------------------------------------------------------- /forest-em/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(forest-em) 2 | our_boost_libs(program_options serialization system filesystem 3 | random chrono timer iostreams filesystem 4 | unit_test_framework 5 | ) 6 | find_package(zstd QUIET) 7 | message("zstd found?: ${zstd_FOUND}") 8 | set(zstdlib) 9 | if (zstd_FOUND) 10 | # at least with mac+brew boost, iostreams appears to pull in zstd, lzma, z, bz2 libs; -lzstd wasn't found 11 | link_directories("/usr/local/lib") 12 | list(APPEND zstdlib zstd::libzstd_static) 13 | endif() 14 | add_executable(text-to-cc ../graehl/shared/text-to-cc.cpp) 15 | include_directories(${PROJECT_SOURCE_DIR}) 16 | set(README_IN forest-em.README) 17 | set(README_GEN_H ${PROJECT_SOURCE_DIR}/forest-em.README.hpp) 18 | add_custom_command(OUTPUT ${README_GEN_H} 19 | DEPENDS ${PROJECT_SOURCE_DIR}/${README_IN} # regenerate the header whenever the README text changes 20 | COMMAND text-to-cc usage_str < ${PROJECT_SOURCE_DIR}/${README_IN} > ${README_GEN_H} 21 | VERBATIM 22 | ) 23 | 
add_executable(forest-em forest-em.cpp) 24 | set_property(SOURCE forest-em.cpp APPEND PROPERTY OBJECT_DEPENDS ${README_GEN_H}) 25 | set_property(TARGET forest-em APPEND PROPERTY OBJECT_DEPENDS ${README_GEN_H}) 26 | add_executable(forestviz forestviz.cpp) 27 | 28 | target_link_libraries(forest-em Boost::timer Boost::random Boost::iostreams Boost::program_options ${zstdlib}) 29 | target_link_libraries(forestviz Boost::random Boost::iostreams Boost::program_options ${zstdlib}) 30 | -------------------------------------------------------------------------------- /forest-em/README: -------------------------------------------------------------------------------- 1 | make INSTALL_PREFIX=/usr/local install -j 4 2 | forest-em --help 3 | 4 | Note: you can also read forest-em.README before compiling. 5 | -------------------------------------------------------------------------------- /forest-em/forest-em.cpp: -------------------------------------------------------------------------------- 1 | #define GRAEHL__SINGLE_MAIN 2 | #ifdef DEBUG 3 | //# define TEST_ADD_ONE_LIMIT 4 | # endif 5 | #include "forest-em-params.hpp" 6 | //#define SINGLE_PRECISION 7 | //#define HINT_SWAPBATCH_BASE 8 | #include 9 | #include //auto_ptr 10 | #include 11 | #ifndef GRAEHL_TEST 12 | 13 | using namespace boost; 14 | using namespace std; 15 | using namespace boost::program_options; 16 | using namespace graehl; 17 | 18 | 19 | 20 | //#define FOREST_EM_VERSION_STR(type,size) "sizeof(" #type ")=" FOREST_EM_STRINGIZE(size) 21 | //#define FOREST_EM_VERSION_SIZE(name,type) FOREST_EM_VERSION_STR(name,sizeof(type)) 22 | //#define FOREST_EM_SIZE_COUNT sizeof(forest::count_t) 23 | //#define FOREST_EM_VERSION_STRING FOREST_EM_VERSION "-" FOREST_EM_VERSION_STR(count,FOREST_EM_SIZE_COUNT) 24 | //FOREST_EM_VERSION_SIZE(prob,forest::prob_t) 25 | 26 | MAIN_BEGIN 27 | { 28 | DBP_INC_VERBOSE; 29 | #ifdef DEBUG 30 | DBP::set_logstream(&cerr); 31 | #endif 32 | //DBP_OFF; 33 | 34 | return forest_em_param.main(argc,argv); 
35 | 36 | } 37 | MAIN_END 38 | 39 | #endif 40 | 41 | -------------------------------------------------------------------------------- /forest-em/sample/.gitignore: -------------------------------------------------------------------------------- 1 | forests.dot 2 | forests.b 3 | -------------------------------------------------------------------------------- /forest-em/sample/Makefile: -------------------------------------------------------------------------------- 1 | default: 2 | cd ../ && make 3 | -------------------------------------------------------------------------------- /forest-em/sample/best_forest: -------------------------------------------------------------------------------- 1 | (1 2 | (OR (OR 3 | (1 (OR #1(2 (OR 1 3)) (OR 3 4) (OR (1 (2 3)) 4)) (OR 1 2 (3 2) (2 (OR #1 4)) (1 #1 #1))) 4 | (4 4 (OR 1 1 2) (OR 4 4) (OR 1 4) 4) 5 | 5 6 | ) 7 | )) 8 | -------------------------------------------------------------------------------- /forest-em/sample/best_norm: -------------------------------------------------------------------------------- 1 | ((1 2 3 4 5)) 2 | -------------------------------------------------------------------------------- /forest-em/sample/best_weights: -------------------------------------------------------------------------------- 1 | e^-4 2 | e^-2 3 | e^-3 4 | e^-6 5 | e^-100 6 | -------------------------------------------------------------------------------- /forest-em/sample/byid_rules: -------------------------------------------------------------------------------- 1 | rule3 a id=3 2 | rule4 a id=4 3 | rule1 a id=1 4 | rule2 a id=2 5 | rule5 a id=5 6 | rule9 a id=9 7 | rule10 a id=10 8 | rule6 a id=6 9 | rule7 a id=7 10 | rule8 a id=8 11 | rule11 a id=11 12 | rule12 a id=12 13 | rule13 a id=13 14 | rule14 a id=14 15 | rule15 a id=15 16 | rule16 a id=16 17 | -------------------------------------------------------------------------------- /forest-em/sample/derivs/first10.norm: 
-------------------------------------------------------------------------------- 1 | ((85 139) (72) (52) (35) (31) (44) (28) (118) (45) (188) (36 166) (168) (34) (187) (185) (16 65) (53) (146) (200) (198 79) (29) (92) (46) (62) (147) (107) (12) (184) (207) (43 156 25 22) (138) (152) (154) (83) (95) (143) (100) (150 76) (61) (78) (50) (159) (177) (179) (80) (54) (123) (201) (178) (206 102 174 74) (18) (158) (153) (167 191) (119) (182) (77) (14) (30) (99) (64) (157 208) (113) (58) (106) (109 40 94 104 145 144) (171 204 41) (49) (11) (86) (128 170) (10) (5) (195) (169) (149) (141) (59) (88 38 162) (90) (133) (56) (24) (122) (210) (148) (60 165) (211 175) (155) (26) (9) (68 135) (91) (172) (120) (110) (23 181) (131) (81 1) (112) (97) (2) (183 3) (75) (129) (116) (98) (189) (161) (124) (67) (136) (57) (202) (140) (203) (194) (121) (48) (173) (70 103) (125) (164) (55) (82) (51) (37) (193) (33) (4) (66) (197) (142) (96) (27) (126) (73) (87) (176) (163) (209) (130) (7) (20) (132) (13) (93) (17 205) (101) (19) (32) (6) (180) (115) (134) (71 111) (137) (47) (186 39) (42) (199) (8) (108) (160 192) (117) (69) (105) (127) (84) (151 15 114 190) (21) (196) (89) (63)) 2 | -------------------------------------------------------------------------------- /forest-em/sample/derivs/first10.rules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/forest-em/sample/derivs/first10.rules -------------------------------------------------------------------------------- /forest-em/sample/derivs/first100.rules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/forest-em/sample/derivs/first100.rules -------------------------------------------------------------------------------- /forest-em/sample/derivs/first1000.rules: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/forest-em/sample/derivs/first1000.rules -------------------------------------------------------------------------------- /forest-em/sample/derivs/first10000.rules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/forest-em/sample/derivs/first10000.rules -------------------------------------------------------------------------------- /forest-em/sample/forest: -------------------------------------------------------------------------------- 1 | (OR 2 | #1(1 #2(2) 3 #2) (4 #4(5) #2) (6 #2 #4) (7 8) 3 | (9 #5(OR (10 (11 12)) #6(13 14)) (15 #1 #2) (16 #6)) 4 | ) 5 | -------------------------------------------------------------------------------- /forest-em/sample/forests: -------------------------------------------------------------------------------- 1 | (OR 2 | #1(1 #2(2) 3 #2) (4 #4(5) #2) (6 #2 #4) (7 8) 3 | (9 #5(OR (10 (11 12)) #6(13 14)) (15 #1 #2) (16 #6)) 4 | ) 5 | (1 4) 6 | (OR (1 4) (1 3)) 7 | (OR (1 4 4) (2 3 4) (2 4 3) (1 5)) 8 | (OR #1(1 #2(OR 2 3) #2) (4 #4(OR #2 5) #2) (6 #2 #4) (7 8)) 9 | -------------------------------------------------------------------------------- /forest-em/sample/forests.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/forest-em/sample/forests.gz -------------------------------------------------------------------------------- /forest-em/sample/ints: -------------------------------------------------------------------------------- 1 | 1 2 3 2 | 1 3 | 1 1 1 4 | 2 2 5 | 5 6 | 7 | 9 8 | 7 9 | 12 10 | 2 2 11 | 5 12 | 13 | 9 14 | 7 15 | 12 16 | 1 20 17 | 17 18 | 16 19 | 16 20 | 9 9 9 21 | 9 9 9 22 | 9 9 9 23 | 9 9 9 24 | 9 9 9 25 
| 11 11 11 26 | 11 11 11 27 | 11 11 11 28 | 11 11 11 29 | 11 11 11 30 | 11 11 11 31 | 14 14 14 32 | 14 14 14 33 | 14 14 14 34 | 15 15 15 15 35 | 15 15 15 15 36 | 15 15 15 15 37 | 15 15 15 15 38 | 5 5 5 5 5 5 5 39 | 5 5 5 5 5 5 5 40 | 5 5 5 5 5 5 5 41 | 5 5 5 5 5 5 5 42 | 5 5 5 5 5 5 5 43 | 5 5 5 5 5 5 5 44 | 19 20 21 22 45 | 19 20 21 22 46 | 19 20 21 22 47 | 19 20 21 22 48 | 19 20 21 22 49 | 19 20 21 22 50 | 19 20 21 22 51 | 19 20 21 22 52 | 5 10 10 10 10 10 10 10 10 10 10 10 10 10 53 | 54 | 5 10 10 10 10 10 10 10 10 10 10 10 10 10 55 | 56 | 5 13 13 13 13 13 13 13 13 13 13 13 57 | 58 | 5 14 14 14 14 14 14 14 14 14 14 14 59 | 60 | 5 14 14 13 13 13 13 13 13 13 13 13 61 | -------------------------------------------------------------------------------- /forest-em/sample/norm: -------------------------------------------------------------------------------- 1 | ((1 2 7 ) (3 4 5 6)) 2 | -------------------------------------------------------------------------------- /forest-em/sample/norm_and_forests: -------------------------------------------------------------------------------- 1 | ((1 2 7 ) (3 4 5 6)) 2 | (1 4) 3 | (OR (1 4) (1 3)) 4 | (OR (1 4 4) (2 3 4) (2 4 3) (1 5)) 5 | (OR #1(1 #2(4) #2) (2 #4(3) #2) (2 #2 #4) (1 5)) 6 | 7 | -------------------------------------------------------------------------------- /forest-em/sample/raw_weight_array: -------------------------------------------------------------------------------- 1 | e^2 2 | 1 3 | 0 4 | .5 5 | 10 6 | 2 7 | 3 8 | 4 9 | 5 10 | e^5 11 | e^-1e+30 12 | e^1 13 | -------------------------------------------------------------------------------- /forest-em/sample/rule_list: -------------------------------------------------------------------------------- 1 | rule1 a id=1 2 | rule2 a id=2 3 | rule3 a id=3 4 | rule4 a id=4 5 | rule5 a id=5 6 | rule6 a id=6 7 | rule7 a id=7 8 | rule8 a id=8 9 | rule9 a id=9 10 | rule10 a id=10 11 | rule11 a id=11 12 | rule12 a id=12 13 | rule13 a id=13 14 | rule14 a id=14 15 | rule15 a 
id=15 16 | rule16 a id=16 17 | -------------------------------------------------------------------------------- /forest-em/sample/testderivs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | d=`dirname $0` 3 | F=$d/../bin/$ARCH/forest-em 4 | dd=$d/derivs 5 | w=$1 6 | shift 7 | normsuffix=${1:-norm} 8 | shift 9 | norm=$dd/$w.$normsuffix 10 | deriv=$dd/$w.deriv 11 | rules=$dd/$w.rules 12 | out=train.$w.$normsuffix.out 13 | log=train.$w.$normsuffix.log 14 | watchrule=`grep -n '^S(x0:NP-C x1:VP)' $rules | head -1 | cut -d: -f1` 15 | echo watching rule $watchrule: 16 | head -$watchrule $rules | tail -1 17 | cm="$F -f $deriv -n $norm -o $out --rules-file $rules --watch-rule $watchrule --watch-depth 40 --watch-period 5 -M 1560 -i 200 -r 4 $*" 18 | echo $cm 19 | time $cm 2>&1 | tee $log 20 | #$F -M 500 -i 200 -f $deriv -n $norm -o $out $* 21 | -------------------------------------------------------------------------------- /forest-em/sample/tree.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/forest-em/sample/tree.gz -------------------------------------------------------------------------------- /gextract/astronauts.a: -------------------------------------------------------------------------------- 1 | 0-1 2-0 2 | 0-0 1-1 2-1 3-2 4-6 4-7 5-3 6-3 7-4 8-8 3 | 1-0 3-3 4-1 6-2 7-4 4 | -------------------------------------------------------------------------------- /gextract/astronauts.e-parse: -------------------------------------------------------------------------------- 1 | (PP (IN by) (DT the) (NN police)) 2 | (S (NP (DT These) (CD 7) (NNS people)) (VP (VBP include) (NP (NP (NNS astronauts)) (VP (VBG coming) (PP (IN from) (NP (NNP (France)))) ))) (. .)) 3 | (S (NP (DT The) (NNS gunmen)) (VP (VBD were) (VP-C (VBN killed) (PP (IN by) (NP (DT the) (NN police) )))) (. 
.)) 4 | -------------------------------------------------------------------------------- /gextract/astronauts.f: -------------------------------------------------------------------------------- 1 | POLICE BY 2 | THESE 7PEOPLE INCLUDE COMINGFROM FRANCE DUH ASTRO- -NAUTS PERIOD 3 | GUNMEN BY POLICE WEREKILLED . 4 | -------------------------------------------------------------------------------- /gextract/castronauts.a: -------------------------------------------------------------------------------- 1 | 0-1 0-0 2 | 0-0 0-1 2-5 4-2 4-6 4-7 5-3 6-3 7-4 8-8 3 | 1-0 3-3 6-1 7-2 7-4 4 | -------------------------------------------------------------------------------- /gextract/castronauts.a-gold: -------------------------------------------------------------------------------- 1 | 0-1 2-0 2 | 0-0 1-1 2-1 3-2 4-6 4-7 5-3 6-3 7-4 8-8 3 | 1-0 3-3 4-1 6-2 7-4 4 | -------------------------------------------------------------------------------- /gextract/castronauts.e-parse: -------------------------------------------------------------------------------- 1 | (PP (IN by) (DT the) (NN police) ) 2 | (S (NP (DT These) (CD 7) (NNS people) ) (VP (VBP include) (NP (NP (NNS astronauts) ) (VP (VBG coming) (PP (IN from) (NP (NNP France) ) ) ) ) ) (. .) ) 3 | (S (NP (DT The) (NNS gunmen) ) (VP (VBD were) (VP-C (VBN killed) (PP (IN by) (NP (DT the) (NN police) ) ) ) ) (. .) ) 4 | -------------------------------------------------------------------------------- /gextract/castronauts.f: -------------------------------------------------------------------------------- 1 | POLICE BY 2 | THESE 7PEOPLE INCLUDE COMINGFROM FRANCE DUH ASTRO- -NAUTS PERIOD 3 | GUNMEN BY POLICE WEREKILLED . 
4 | -------------------------------------------------------------------------------- /gextract/etree.py: -------------------------------------------------------------------------------- 1 | ../sblm/etree.py -------------------------------------------------------------------------------- /gextract/optfunc.py: -------------------------------------------------------------------------------- 1 | optfunc/optfunc.py -------------------------------------------------------------------------------- /gextract/optfunc/__init__.py: -------------------------------------------------------------------------------- 1 | # Empty __init__.py file to make optfunc into a quick-and-dirty module 2 | -------------------------------------------------------------------------------- /gextract/radu2ptb.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | while (<>) { 4 | s/^0$//; 5 | s/\(([^~]+)~(\d+)~(\d+)\s+([-.\d]+)/($1/g; 6 | s/\((-LRB-(-\d+)?) \(\)/\($1 -LRB-\)/g; 7 | s/\((-RRB-(-\d+)?) 
\)\)/\($1 -RRB-\)/g; 8 | print; 9 | } 10 | -------------------------------------------------------------------------------- /gextract/reviz.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export skip=1 3 | skip=1 noise=.1 until=10 every=100 iter=1000 ./do.mono.sh 4 | skip=1 noise=.1 until=10 iter=10 ./do.mono.sh 5 | [ "$first" ] && exit 6 | skip=1 noised=0 temp0=10 tempf=.5 until=3 every=10 noise=0 iter=40 ./do.mono.sh 7 | vizall=1 skip=1 noised=0 until=3 every=10 noise=0 iter=100 ./do.mono.sh 8 | skip=1 noised=4 until=3 every=20 noise=.3 iter=100 ./do.mono.sh 9 | skip=1 vizall=1 noised=0 temp0=10 tempf=.5 until=3 every=10 noise=0 iter=40 ./do.mono.sh 10 | skip=1 until=5 nomono=1 temp0=1.2 tempf=.2 iter=100 every=20 ./do.mono.sh 11 | skip=1 until=10 nomono=1 iter=200 every=20 ./do.mono.sh 12 | skip=1 noised=5 until=3 every=10 noise=.3 iter=120 ./do.mono.sh 13 | skip=1 until=5 noised=2 every=20 temp0=1 tempf=1 noise=.2 iter=160 ./do.mono.sh 14 | 15 | skip=1 iter=100 nin=1000 noise=.3 noised=5 ./do.mono.sh 16 | 17 | skip=1 noised=2 until=3 every=10 temp0=1.5 tempf=.08 noise=.2 iter=80 ./do.mono.sh 18 | 19 | -------------------------------------------------------------------------------- /graehl/shared/.gdbinit: -------------------------------------------------------------------------------- 1 | catch throw 2 | r 3 | -------------------------------------------------------------------------------- /graehl/shared/.gitignore: -------------------------------------------------------------------------------- 1 | SGT 2 | -------------------------------------------------------------------------------- /graehl/shared/Lx_norm.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl - http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #ifndef GRAEHL_SHARED__LX_NORM_HPP 15 | #define GRAEHL_SHARED__LX_NORM_HPP 16 | 17 | #include 18 | 19 | namespace graehl { 20 | 21 | struct sum_powx 22 | { 23 | double x; 24 | 25 | // most common is L2-norm (Euclid. distance) 26 | sum_powx(double x = 2) : x(x) {} 27 | 28 | template 29 | W operator()(W total, W component) 30 | { 31 | return total+pow(component, x); 32 | } 33 | 34 | //boost::result_of 35 | template struct result {}; 36 | template struct result { typedef W type; }; 37 | }; 38 | 39 | template 40 | typename range_value::type 41 | lx_norm(R const& range, double x = 2) 42 | { 43 | return pow(reduce(range, sum_powx(x), 0), 1./x); 44 | } 45 | 46 | 47 | 48 | } 49 | 50 | 51 | #endif 52 | -------------------------------------------------------------------------------- /graehl/shared/Makefile: -------------------------------------------------------------------------------- 1 | P=main_template 2 | 3 | all: next 4 | 5 | test: next 6 | 7 | .PHONY: test 8 | 9 | %: 10 | g++ words_per_line.cpp -I../.. 
-o words_per_line && ./words_per_line < words_per_line.cpp 11 | 12 | 13 | -------------------------------------------------------------------------------- /graehl/shared/SGT.counts.txt: -------------------------------------------------------------------------------- 1 | 1 120 2 | 2 40 3 | 3 24 4 | 4 13 5 | 5 15 6 | 6 5 7 | 7 11 8 | 8 2 9 | 9 2 10 | 10 1 11 | 12 3 12 | -------------------------------------------------------------------------------- /graehl/shared/_template.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl - http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #ifndef TEMPLATE_HPP 15 | #define TEMPLATE_HPP 16 | 17 | #ifdef GRAEHL_TEST 18 | #include 19 | #endif 20 | 21 | #ifdef GRAEHL_TEST 22 | BOOST_AUTO_TEST_CASE( TEST_TEMPLATE ) 23 | { 24 | } 25 | #endif 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /graehl/shared/abs_int.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl - http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #ifndef GRAEHL__SHARED__ABS_INT_HPP 15 | #define GRAEHL__SHARED__ABS_INT_HPP 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | namespace graehl { 23 | 24 | template 25 | inline typename boost::enable_if< typename boost::is_integral 26 | , typename boost::remove_cv::type 27 | >::type 28 | bit_rotate_right(I x) 29 | { 30 | typedef typename boost::remove_cv::type IT; 31 | return x<0?-x:x; 32 | } 33 | 34 | } 35 | 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /graehl/shared/assertlvl.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl-http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | /** \file 15 | 16 | a continuum of asserts (finer than all-off-for-release) 17 | */ 18 | 19 | 20 | #ifndef GRAEHL__SHARED__ASSERTLVL_HPP 21 | #define GRAEHL__SHARED__ASSERTLVL_HPP 22 | #pragma once 23 | 24 | #ifndef ASSERT_LEVEL 25 | #define ASSERT_LEVEL 9999 26 | #endif 27 | 28 | #define IF_ASSERT(level) if (ASSERT_LEVEL >= level) 29 | #define UNLESS_ASSERT(level) if (ASSERT_LEVEL < level) 30 | #ifndef assertlvl 31 | #include 32 | #define assertlvl(level, assertion) \ 33 | do { \ 34 | IF_ASSERT(level) { assert(assertion); } \ 35 | } while (0) 36 | #endif 37 | 38 | #endif 39 | -------------------------------------------------------------------------------- /graehl/shared/batched_append.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl - http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | #ifndef GRAEHL__SHARED__batched_append_hpp 15 | #define GRAEHL__SHARED__batched_append_hpp 16 | 17 | #include //swap 18 | #include 19 | 20 | template 21 | void batched_append(Vector &v, SRange const& s) { 22 | std::size_t news = v.size()+s.size(); 23 | v.reserve(news); 24 | v.insert(v.end(), s.begin(), s.end()); 25 | } 26 | 27 | template 28 | void batched_append_swap(Vector &v, SRange & s) { 29 | using namespace std; // to find the right swap 30 | size_t i = v.size(); 31 | size_t news = i+s.size(); 32 | v.resize(news); 33 | typename SRange::iterator si = s.begin(); 34 | for (; i='0' && c<='9'; 24 | } 25 | inline bool isalpha(char c) { 26 | return c>='A' && c<='Z' || c>='a'&& c<='z'; 27 | } 28 | inline bool isblank(char c) { 29 | return c=='\t' || c==' '; 30 | } 31 | inline bool isspace(char c) { 32 | return c=='\n' || isblank(c); // intentionally neglecting \r \v \f 33 | } 34 | MAKE_CHARP(isdigit) 35 | MAKE_CHARP(isalpha) 36 | MAKE_CHARP(isblank) 37 | MAKE_CHARP(isspace) 38 | 39 | }//ns 40 | 41 | 42 | #endif 43 | -------------------------------------------------------------------------------- /graehl/shared/cpp11.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #ifndef GRAEHL_CPP11 3 | 4 | #if __cplusplus >= 201700L 5 | #define GRAEHL_CPP17 1 6 | #else 7 | #define GRAEHL_CPP17 0 8 | #endif 9 | 10 | #if __cplusplus >= 201103L || SDL_CPP11 || _MSC_VER >= 1900 11 | #define GRAEHL_CPP11 1 12 | #if __cplusplus >= 201400L 13 | #define GRAEHL_CPP14 1 14 | #define GRAEHL_CPP14_TYPETRAITS 1 15 | #else 16 | #define GRAEHL_CPP14 0 17 | #define GRAEHL_CPP14_TYPETRAITS 0 18 | #endif 19 | #else 20 | #define GRAEHL_CPP11 0 21 | #define GRAEHL_CPP14 0 22 | #define GRAEHL_CPP14_TYPETRAITS 0 23 | #endif 24 | 25 | #if GRAEHL_CPP11 26 | #define GRAEHL_CONSTEXPR constexpr 27 | #else 28 | #define GRAEHL_CONSTEXPR 29 | #endif 30 | 31 | #if _MSC_VER >= 1900 32 | #undef GRAEHL_CPP14_TYPETRAITS 33 | #define 
GRAEHL_CPP14_TYPETRAITS 1 34 | #endif 35 | 36 | #if __cplusplus >= 201700L 37 | // GCC 8.2 has 201709 and clang 7.0 has 201707 38 | #define GRAEHL_CPP17 1 39 | #else 40 | #define GRAEHL_CPP17 0 41 | #endif 42 | 43 | #if __cplusplus >= 202000L 44 | // GCC 8.2 has 201709 and clang 7.0 has 201707 45 | #define GRAEHL_CPP20 1 46 | #else 47 | #define GRAEHL_CPP20 0 48 | #endif 49 | 50 | #endif 51 | -------------------------------------------------------------------------------- /graehl/shared/dbg_level.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl-http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | /** \file 15 | 16 | . 
17 | */ 18 | 19 | #ifndef GRAEHL_SHARED__DBG_LEVEL_HPP 20 | #define GRAEHL_SHARED__DBG_LEVEL_HPP 21 | #pragma once 22 | 23 | #include 24 | 25 | #define DECLARE_DBG_LEVEL_C(n, env) DECLARE_ENV_C_LEVEL(n, getenv_##env, env) 26 | #define DECLARE_DBG_LEVEL(ch) DECLARE_DBG_LEVEL_C(ch##_DBG_LEVEL, ch##_DBG) 27 | #define DECLARE_DBG_LEVEL_IF(ch) ch(DECLARE_DBG_LEVEL_C(ch##_DBG_LEVEL, ch##_DBG)) 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /graehl/shared/dummy.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl - http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | #ifndef GRAEHL_SHARED__DUMMY_HPP 15 | #define GRAEHL_SHARED__DUMMY_HPP 16 | 17 | #ifdef GRAEHL_TEST 18 | #include 19 | #endif 20 | 21 | namespace graehl { 22 | 23 | template 24 | struct dummy { 25 | static const C &var(); 26 | }; 27 | 28 | 29 | template 30 | const C& dummy::var() { 31 | static C var; 32 | return var; 33 | } 34 | 35 | #ifdef GRAEHL_TEST 36 | 37 | BOOST_AUTO_TEST_CASE( TEST_dummy ) 38 | { 39 | BOOST_CHECK(dummy::var() == 0); 40 | } 41 | #endif 42 | 43 | } 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /graehl/shared/exact_cast.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl - http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | #ifndef GRAEHL__SHARED__EXACT_CAST_HPP 15 | #define GRAEHL__SHARED__EXACT_CAST_HPP 16 | 17 | #include 18 | 19 | namespace graehl { 20 | 21 | struct inexact_cast : public std::runtime_error 22 | { 23 | inexact_cast() : std::runtime_error("inexact_cast - casting to a different type lost information") {} 24 | }; 25 | 26 | template 27 | To exact_static_assign(To &to, From const& from) 28 | { 29 | to = static_cast(from); 30 | if (static_cast(to)!=from) 31 | throw inexact_cast(); 32 | return to; 33 | } 34 | 35 | template 36 | To exact_static_cast(From const& from) 37 | { 38 | To to; 39 | exact_static_assign(to, from); 40 | return to; 41 | } 42 | 43 | 44 | }//graehl 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /graehl/shared/example.Makefile: -------------------------------------------------------------------------------- 1 | #gdb --args /cache/tt/bin/cygwin/forest-em.debug -f /cache/tt/sample/best_forest -I /cache/tt/sample/best_weights -n /cache/tt/sample/best_norm -i 1 -m 100k -w 3 -x sample/best_viterbi -L 9 2 | 3 | PROGS= count-id-freq add-giza-models 4 | #PROGS+=text-to-cc 5 | 6 | 7 | count-id-freq_OBJ=count-id-freq.o 8 | count-id-freq_SLIB=$(BOOST_OPT_LIB) 9 | count-id-freq_NOTEST=1 10 | #count-id-freq_NOSTATIC=1 11 | #count-id-freq_NODEBUG=1 12 | 13 | add-giza-models_OBJ=add-giza-models.o 14 | add-giza-models_SLIB=$(BOOST_OPT_LIB) 15 | add-giza-models_NOTEST=1 16 | 17 | 18 | SHARED=../shared 19 | INC= . 
$(SHARED) 20 | LIB= 21 | CXX:=g++ 22 | 23 | BASECXXFLAGS= -ggdb -ffast-math 24 | CXXFLAGS= $(BASECXXFLAGS) -O -DNO_BACKTRACE -DUSE_NONDET_RANDOM 25 | #-DSINGLE_PRECISION 26 | ## would have to link with boost random nondet source 27 | 28 | CPPFLAGS_DEBUG+= -DDEBUG 29 | CXXFLAGS_DEBUG= $(BASECXXFLAGS) 30 | # -DDEBUGFIXEDINPUT 31 | CPPFLAGS_TEST+= -DTEST -DDEBUG 32 | CXXFLAGS_TEST=$(BASECXXFLAGS) 33 | #CPP_EXT=cpp 34 | ALL_CLEAN += *.restart.* *.swap.* *.stackdump *.d *.out *.log massif.* core 35 | 36 | default: all 37 | #forest-em-debug 38 | #mydefault 39 | 40 | vpath %.cpp .:$(SHARED) 41 | 42 | include ../shared/graehl.mk 43 | 44 | 45 | mydefault: $(BIN)/count-id-freq.debug 46 | -------------------------------------------------------------------------------- /graehl/shared/fast_lexical_cast.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl - http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | #ifndef FAST_LEXICAL_CAST_HPP 15 | #define FAST_LEXICAL_CAST_HPP 16 | 17 | #define BOOST_LEXICAL_CAST_ASSUME_C_LOCALE 18 | 19 | #include 20 | 21 | using boost::lexical_cast; 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /graehl/shared/force_link.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl-http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | /** \file 15 | 16 | avoid elimination of dead symbols while static linking. 
17 | */ 18 | 19 | #ifndef FORCE_LINK_JG_2015_03_23_HPP 20 | #define FORCE_LINK_JG_2015_03_23_HPP 21 | #pragma once 22 | 23 | #include 24 | 25 | namespace graehl { 26 | 27 | static void force_link(void* p) { 28 | static volatile std::size_t forced_link; 29 | forced_link ^= (std::size_t)p; 30 | } 31 | 32 | template 33 | static void force_link_class() { 34 | static C f; 35 | force_link(&f); 36 | } 37 | 38 | #define GRAEHL_FORCE_LINK_CLASS(x) graehl::force_link_class(); 39 | 40 | 41 | } 42 | 43 | #endif 44 | -------------------------------------------------------------------------------- /graehl/shared/format.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl-http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | #ifndef GRAEHL__SHARED__FORMAT_HPP 15 | #define GRAEHL__SHARED__FORMAT_HPP 16 | #pragma once 17 | 18 | #include 19 | #include 20 | 21 | namespace fm { 22 | using std::string; 23 | using std::endl; 24 | using std::flush; 25 | using boost::format; 26 | using boost::io::group; 27 | using boost::io::str; 28 | using std::setfill; 29 | using std::setw; 30 | using std::hex; 31 | using std::dec; 32 | using std::showbase; 33 | using std::left; 34 | using std::right; 35 | using std::internal; 36 | } 37 | 38 | #define FSTR(x, y) fm::str(fm::format(x) % y) 39 | 40 | 41 | #endif 42 | -------------------------------------------------------------------------------- /graehl/shared/ftoa_append.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/graehl/shared/ftoa_append.hpp -------------------------------------------------------------------------------- /graehl/shared/glibc_memcpy.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | /// http://www.win.tue.nl/~aeb/linux/misc/gcc-semibug.html 3 | #if !defined(__APPLE__) && defined(__linux__) && defined(__GNUC__) && defined(__LP64__) \ 4 | && !defined(USE_LATEST_MEMCPY) /* only under 64 bit gcc */ 5 | __asm__(".symver memcpy,memcpy@GLIBC_2.2.5"); 6 | #endif 7 | -------------------------------------------------------------------------------- /graehl/shared/have_64_bits.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl-http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | /** \file 15 | 16 | detect: pointer and size_t are 64 bits 17 | */ 18 | 19 | #ifndef GRAEHL_SHARED__HAVE_64_BITS_HPP 20 | #define GRAEHL_SHARED__HAVE_64_BITS_HPP 21 | #pragma once 22 | 23 | #ifndef HAVE_64_BITS 24 | 25 | // Check windows 26 | #if defined(_WIN32) || defined(_WIN64) 27 | #if defined(_WIN64) 28 | #define HAVE_64_BITS 1 29 | #else 30 | #define HAVE_64_BITS 0 31 | #endif 32 | #elif __x86_64__ || __ppc64__ 33 | #define HAVE_64_BITS 1 34 | #else 35 | #define HAVE_64_BITS 0 36 | #endif 37 | 38 | #endif // HAVE_64_BITS 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /graehl/shared/identity.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl - http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | #ifndef GRAEHL_SHARED__IDENTITY_HPP 15 | #define GRAEHL_SHARED__IDENTITY_HPP 16 | 17 | namespace graehl { 18 | 19 | template 20 | struct identity 21 | { 22 | typedef V argument_type; 23 | typedef V result_type; 24 | result_type operator()(argument_type a) const { return a; } 25 | }; 26 | 27 | template 28 | struct identity_ref 29 | { 30 | typedef V argument_type; 31 | typedef V result_type; 32 | result_type const& operator()(argument_type const& a) const { return a; } 33 | result_type & operator()(argument_type & a) const { return a; } 34 | }; 35 | 36 | // should be safe as identity 37 | template 38 | struct identity_cref 39 | { 40 | typedef V const& argument_type; 41 | typedef V const& result_type; 42 | result_type operator()(argument_type a) const { return a; } 43 | }; 44 | 45 | 46 | } 47 | 48 | 49 | #endif 50 | -------------------------------------------------------------------------------- /graehl/shared/inline.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl-http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | /** \file 15 | 16 | force inlining on or off: 17 | 18 | ALWAYS_INLINE void f() { g(); } 19 | NEVER_INLINE void f2() { g(); } 20 | */ 21 | 22 | #ifndef INLINE_JG_2014_11_12_HPP 23 | #define INLINE_JG_2014_11_12_HPP 24 | #pragma once 25 | 26 | #ifndef ALWAYS_INLINE 27 | #if defined(__GNUC__) || defined(__clang__) 28 | #define ALWAYS_INLINE inline __attribute__((__always_inline__)) 29 | #elif defined(_MSC_VER) 30 | #define ALWAYS_INLINE __forceinline 31 | #else 32 | #define ALWAYS_INLINE inline 33 | #endif 34 | #endif 35 | 36 | #ifndef NEVER_INLINE 37 | #if defined(__GNUC__) || defined(__clang__) 38 | #define NEVER_INLINE __attribute__((__noinline__)) 39 | #elif defined(_MSC_VER) 40 | #define NEVER_INLINE __declspec(noinline) 41 | #else 42 | #define NEVER_INLINE 43 | #endif 44 | #endif 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /graehl/shared/karma_tostr.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl - http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | #ifndef KARMA_GENERATE_HPP 15 | #define KARMA_GENERATE_HPP 16 | 17 | #include 18 | 19 | namespace karma = boost::spirit::karma; 20 | 21 | template 22 | bool tostr(std::string& str, T const& value) 23 | { 24 | std::back_insert_iterator sink(str); 25 | return karma::generate(sink, value); 26 | } 27 | 28 | template 29 | std::string tostr(T const& value) 30 | { 31 | std::string str; 32 | std::back_insert_iterator sink(str); 33 | karma::generate(sink, value); 34 | return str; 35 | } 36 | 37 | 38 | #endif 39 | -------------------------------------------------------------------------------- /graehl/shared/lc_ascii.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl-http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | /** \file 15 | 16 | case-insensitive string keys 17 | */ 18 | 19 | #ifndef GRAEHL_SHARED__LC_ASCII_HPP 20 | #define GRAEHL_SHARED__LC_ASCII_HPP 21 | #pragma once 22 | 23 | 24 | namespace graehl { 25 | 26 | inline char lc_ascii(char c) { 27 | if (c >= 'A' && c <= 'Z') c -= ('A' - 'a'); 28 | return c; 29 | } 30 | 31 | template 32 | String& lc_ascii_inplace(String& s) { 33 | for (typename String::iterator i = s.begin(), e = s.end(); i != e; ++i) *i = lc_ascii(*i); 34 | return s; 35 | } 36 | 37 | template 38 | void append_lc_ascii(String& r, char const* s) { 39 | while (*s) r.push_back(lc_ascii(*s++)); 40 | } 41 | 42 | template 43 | void set_lc_ascii(String& r, char const* s) { 44 | r.clear(); 45 | append_lc_ascii(r, s); 46 | } 47 | 48 | 49 | } 50 | 51 | #endif 52 | -------------------------------------------------------------------------------- /graehl/shared/lerp.hpp: -------------------------------------------------------------------------------- 1 | /** \file 2 | 3 | fused multiply-and-add optimized linear interpolation 4 | 5 | https://en.wikipedia.org/wiki/FMA_instruction_set 6 | 7 | want compiler to enable FMA3 (3 arg) not FMA4 (4 arg) 8 | */ 9 | 10 | #ifndef LERP_JG_2015_06_17_HPP 11 | #define LERP_JG_2015_06_17_HPP 12 | #pragma once 13 | 14 | namespace graehl { 15 | 16 | template 17 | T fma(T a, T b, T c) { 18 | return a * b + c; 19 | } 20 | 21 | /// \return ta*a + (1-ta)*b, optimized for fma 22 | template 23 | T lerp(T a, T b, T ta) { 24 | return fma(ta, a, fma(-ta, b, b)); 25 | } 26 | 27 | 28 | } 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /graehl/shared/likely.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl-http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | /** \file 15 | 16 | branch prediction annotations. note: gcc already has some heuristics that guess 17 | ok. add predictions only if you're sure or you benchmarked. 18 | 19 | usage: if (likely_true(a>b)) ...; // meaning you expect a>b to be true. 20 | */ 21 | 22 | #ifndef LIKELY_GRAEHL_2015_10_21_HPP 23 | #define LIKELY_GRAEHL_2015_10_21_HPP 24 | #pragma once 25 | 26 | /// standard-ish from linux kernel code but with a safe(ish) longer name: 27 | /// usage: if (likely_true(a>b)) ... 28 | /// meaning you expect a>b to be true. 29 | #ifdef _MSC_VER 30 | #define likely_true(x) (x) 31 | #define likely_false(x) (x) 32 | #else 33 | #define likely_true(x) __builtin_expect(!!(x), 1) 34 | #define likely_false(x) __builtin_expect(!!(x), 0) 35 | #endif 36 | #endif 37 | -------------------------------------------------------------------------------- /graehl/shared/lz4.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl-http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #ifndef GRAEHL__SHARED__LZ4_H 15 | #define GRAEHL__SHARED__LZ4_H 16 | #pragma once 17 | 18 | #ifndef LZ4__INLINE 19 | #if defined(GRAEHL__SINGLE_MAIN) 20 | #define LZ4__INLINE 1 21 | #else 22 | #define LZ4__INLINE 0 23 | #endif 24 | #endif 25 | 26 | namespace lz4 { 27 | #include "lz4.c" 28 | #include "lz4.h" 29 | 30 | 31 | } 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /graehl/shared/maybe_update_bound.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl - http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #ifndef GRAEHL__SHARED__MAYBE_UPDATE_BOUND_HPP 15 | #define GRAEHL__SHARED__MAYBE_UPDATE_BOUND_HPP 16 | 17 | namespace graehl { 18 | 19 | // see also associative container versions in assoc_container.hpp 20 | 21 | template 22 | inline void maybe_increase_max(To &to, const From &from) { 23 | if (to 28 | inline void maybe_decrease_min(To &to, const From &from) { 29 | if (from 18 | #include 19 | 20 | namespace graehl { 21 | 22 | struct mean_field_scale 23 | { 24 | bool linear; // if linear, then don't use alpha. 
otherwise convert to exp(digamma(alpha+x)) 25 | double alpha; 26 | 27 | // returns exp(digamma(x)) 28 | template 29 | logweight operator()(logweight const& x) const 30 | { 31 | if (linear) 32 | return x; 33 | double r = x.getReal(); 34 | if (x < .0001) // until we can compute digamma in logspace, this will be the answer. and, can't ask digamma(0), because it's negative inf. but exp(-inf)=0 35 | return 0; 36 | logweight ret; 37 | ret.setLn(digamma(alpha+r)); return ret; 38 | } 39 | }; 40 | 41 | } 42 | 43 | 44 | #endif 45 | -------------------------------------------------------------------------------- /graehl/shared/must_eof.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl - http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | #ifndef GRAEHL_SHARED__MUST_EOF_HPP 15 | #define GRAEHL_SHARED__MUST_EOF_HPP 16 | 17 | #include 18 | #include 19 | 20 | namespace graehl { 21 | 22 | template 23 | inline void must_eof(I &in, char const* msg="Expected end of input, but got: ") 24 | { 25 | char c; 26 | if (in >> c) 27 | throw std::runtime_error(msg+std::string(1, c)); 28 | } 29 | 30 | } 31 | 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /graehl/shared/new_shared.hpp: -------------------------------------------------------------------------------- 1 | /** \file 2 | 3 | make_shared but return a new pointer to shared_ptr that must be deleted, i.e. 4 | instead of new shared_ptr(new T(...)), new_shared(...) 5 | */ 6 | 7 | #ifndef NEW_SHARED_GRAEHL_HPP 8 | #define NEW_SHARED_GRAEHL_HPP 9 | #pragma once 10 | 11 | #include 12 | #include 13 | 14 | namespace graehl { 15 | 16 | /// equivalent to new shared_ptr(new T(args...)), new_shared(args...) but with make_shared allocation benefit 17 | /// perhaps C++2x will also allow the shared_ptr to be singly allocated contiguous to its implementation 18 | template 19 | std::shared_ptr *new_shared(A&&... args) { 20 | return new std::shared_ptr(std::make_shared(std::forward(args)...)); 21 | } 22 | 23 | } // namespace graehl 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /graehl/shared/no_locking.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl-http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #ifndef GRAEHL_SHARED__NO_LOCKING_HPP 15 | #define GRAEHL_SHARED__NO_LOCKING_HPP 16 | #pragma once 17 | 18 | #include 19 | 20 | /* 21 | 22 | intent: 23 | 24 | template // or graehl::locking 25 | struct collection : private Locking 26 | { 27 | void some_operation() 28 | { 29 | typename Locking::lock(*this); 30 | // or bool do_lock=...; 31 | // typename Locking::scoped_lock(*this, do_lock); 32 | // (locks if do_lock) 33 | } 34 | }; 35 | */ 36 | 37 | namespace graehl { 38 | 39 | struct no_locking { 40 | typedef no_locking self_type; 41 | typedef no_locking mutex_type; 42 | struct guard_type { 43 | guard_type(self_type const& l) {} 44 | guard_type(self_type const& l, bool b) {} 45 | }; 46 | }; 47 | 48 | 49 | } 50 | 51 | #endif 52 | -------------------------------------------------------------------------------- /graehl/shared/nondet_random.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/graehl/shared/nondet_random.cpp -------------------------------------------------------------------------------- /graehl/shared/noreturn.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl-http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | /** \file 15 | 16 | avoid no-return-value compiler warnings for infinite loops and throw that 17 | will never return from a fn 18 | 19 | usage: void f() NORETURN 20 | 21 | c++11 alternative: void f() [[noreturn]] didn't work for me 22 | 23 | perhaps you can put NORETURN before or after the decl; after works 24 | */ 25 | 26 | #ifndef NORETURN_JG2012613_HPP 27 | #define NORETURN_JG2012613_HPP 28 | #pragma once 29 | 30 | #if defined(__GNUC__) && __GNUC__ >= 3 || defined(__clang__) 31 | #define NORETURN __attribute__((noreturn)) 32 | #else 33 | #define NORETURN 34 | #endif 35 | 36 | #if defined(__clang__) 37 | #define ANALYZER_NORETURN __attribute__((analyzer_noreturn)) 38 | #else 39 | #define ANALYZER_NORETURN 40 | #endif 41 | 42 | #if defined(_MSC_VER) 43 | #define NORETURNPRE __declspec(noreturn) 44 | #else 45 | #define NORETURNPRE 46 | #endif 47 | 48 | #endif 49 | -------------------------------------------------------------------------------- /graehl/shared/null_deleter.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl-http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | /** \file 15 | 16 | shared_ptr helper for nondeleting references (where the refcount is 17 | meaningless, but you pay for it anyway to simplify - everything can be a 18 | shared_ptr) 19 | */ 20 | 21 | #ifndef GRAEHL__SHARED__NULL_DELETER_HPP 22 | #define GRAEHL__SHARED__NULL_DELETER_HPP 23 | #pragma once 24 | 25 | namespace graehl { 26 | 27 | struct null_deleter { 28 | template 29 | void operator()(T const*) const {} 30 | void operator()(void const*) const {} 31 | }; 32 | 33 | template 34 | std::shared_ptr no_delete(V& v) { 35 | return std::shared_ptr(&v, null_deleter()); 36 | } 37 | 38 | template 39 | std::shared_ptr no_delete(V* v) { 40 | return std::shared_ptr(v, null_deleter()); 41 | } 42 | 43 | 44 | } 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /graehl/shared/null_output_iterator.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl - http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #ifndef GRAEHL_SHARED__NULL_OUTPUT_ITERATOR_HPP 15 | #define GRAEHL_SHARED__NULL_OUTPUT_ITERATOR_HPP 16 | 17 | #include 18 | 19 | namespace graehl { 20 | 21 | struct null_output_iterator { 22 | typedef std::output_iterator_tag iterator_category; 23 | typedef void value_type; 24 | typedef void difference_type; 25 | typedef void pointer; 26 | typedef void reference; 27 | template 28 | void operator = (V const& v) const {} 29 | null_output_iterator const& operator*() const { return *this; } 30 | void operator++() const {} 31 | void operator++(int) const {} 32 | }; 33 | 34 | } 35 | 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /graehl/shared/os_memory.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graehl/carmel/ff27a1497b28dfdcc7f785455bf9bd0c18c07681/graehl/shared/os_memory.hpp -------------------------------------------------------------------------------- /graehl/shared/podcpy.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl - http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | #ifndef GRAEHL_SHARED__PODCPY_HPP 15 | #define GRAEHL_SHARED__PODCPY_HPP 16 | 17 | #include 18 | 19 | namespace graehl { 20 | 21 | template inline 22 | void podset(P& dst, unsigned char c = 0) 23 | { 24 | std::memset((void*)&dst, c, sizeof(dst)); 25 | } 26 | 27 | template inline 28 | void podzero(P& dst) 29 | { 30 | std::memset((void*)&dst, 0, sizeof(dst)); 31 | } 32 | 33 | template inline 34 | P &podcpy(P& dst, P const& src) 35 | { 36 | std::memcpy((void*)&dst, (void*)&src, sizeof(dst)); return dst; 37 | } 38 | 39 | } 40 | 41 | 42 | #endif 43 | -------------------------------------------------------------------------------- /graehl/shared/prefix_option.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl-http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | /** \file for boost program options with opt="long-name,l" "prefix-" => 15 | "prefix-long-name" - short option is stripped to avoid conflict */ 16 | 17 | #ifndef GRAEHL_SHARED__PREFIX_OPTION_HPP 18 | #define GRAEHL_SHARED__PREFIX_OPTION_HPP 19 | #pragma once 20 | 21 | 22 | #include 23 | 24 | namespace graehl { 25 | 26 | inline std::string prefix_option(std::string opt, std::string const& prefix = "") { 27 | if (prefix.empty()) return opt; 28 | std::string::size_type nopt = opt.size(); 29 | if (nopt > 2 && opt[nopt - 2] == ',') opt.resize(nopt - 2); 30 | return prefix + opt; 31 | } 32 | 33 | inline std::string suffix_option(std::string opt, std::string const& suffix = "") { 34 | if (suffix.empty()) return opt; 35 | std::string::size_type nopt = opt.size(); 36 | if (nopt > 2 && opt[nopt - 2] == ',') opt.resize(nopt - 2); 37 | return opt + suffix; 38 | } 39 | 40 | 41 | } 42 | 43 | #endif 44 | -------------------------------------------------------------------------------- /graehl/shared/printlines.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl - http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | #ifndef GRAEHL_SHARED__PRINTLINES_HPP 15 | #define GRAEHL_SHARED__PRINTLINES_HPP 16 | 17 | namespace graehl { 18 | 19 | template 20 | void printlines(O &o, I i, I end, const char *endl) 21 | { 22 | for (; i!=end; ++i) 23 | o << *i << endl; 24 | } 25 | 26 | template 27 | void printlines(O &o, I const& i, const char *endl="\n") 28 | { 29 | printlines(o, i.begin(), i.end(), endl); 30 | } 31 | 32 | } 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /graehl/shared/program_options_config_example.txt: -------------------------------------------------------------------------------- 1 | log-file=/tmp/log.example.txt 2 | -------------------------------------------------------------------------------- /graehl/shared/sample/sample.graph: -------------------------------------------------------------------------------- 1 | 6 2 | (0 5 1) 3 | (5 1 5) (5 2 10) (1 2 9) (3 4 2) (4 2 3) (5 3 3) 4 | (5 4 8) (2 4 100) (3 2 6) 5 | (0 0 20) 6 | -------------------------------------------------------------------------------- /graehl/shared/sample/sample.lattice: -------------------------------------------------------------------------------- 1 | 5 2 | (0 1 2) 3 | (0 2 4) 4 | (1 3 6) 5 | (1 2 3) 6 | (2 4 5) 7 | (2 3 3) 8 | (3 4 1) 9 | -------------------------------------------------------------------------------- /graehl/shared/sample/sample.lattice.carmel: -------------------------------------------------------------------------------- 1 | 5 2 | (0 1 2) 3 | (0 2 4) 4 | (1 3 6) 5 | (1 2 3) 6 | (2 4 5) 7 | (2 3 3) 8 | (3 4 1) 9 | -------------------------------------------------------------------------------- /graehl/shared/sample/simple.cycle.graph: -------------------------------------------------------------------------------- 1 | 2 2 | (0 1 1) 3 | (0 0 20) 4 | (1 0 5) 5 | -------------------------------------------------------------------------------- /graehl/shared/semiring.hpp: 
-------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl - http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // unfinished, unused 15 | #ifndef SEMIRING_HPP 16 | #define SEMIRING_HPP 17 | 18 | 19 | /*template 20 | struct semiring_traits { 21 | typedef C value_type; 22 | static inline value_type exponential(double exponent) { 23 | return exponential(exponent); 24 | } 25 | static inline value_type exponential(float exponent) { 26 | return exponential(exponent); 27 | } 28 | }; 29 | */ 30 | 31 | #include 32 | 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /graehl/shared/set_difference.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl - http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #ifndef GRAEHL_SHARED__SET_DIFFERENCE_HPP 15 | #define GRAEHL_SHARED__SET_DIFFERENCE_HPP 16 | 17 | #include 18 | 19 | namespace graehl { 20 | 21 | template 22 | struct set_difference : public std::set 23 | { 24 | void add(K const& k) 25 | { 26 | this->insert(k); 27 | } 28 | bool subtract(K const& k) 29 | { 30 | return this->erase(k); 31 | } 32 | }; 33 | 34 | 35 | } 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /graehl/shared/stacktrace.hpp: -------------------------------------------------------------------------------- 1 | /** \file 2 | 3 | like 'backtrace' in gdb 4 | 5 | (linux only so far) 6 | */ 7 | 8 | #ifndef STACKTRACE_GRAEHL_2016_08_29_HPP 9 | #define STACKTRACE_GRAEHL_2016_08_29_HPP 10 | #pragma once 11 | 12 | #include 13 | 14 | #ifdef __linux__ 15 | #include 16 | #include 17 | #endif 18 | 19 | namespace graehl { 20 | 21 | static const int MAX_TRACE_DEPTH = 255; 22 | 23 | inline void stacktrace(std::ostream& o = std::cerr) { 24 | #ifdef __linux__ 25 | void* trace[MAX_TRACE_DEPTH]; 26 | int trace_size = ::backtrace(trace, MAX_TRACE_DEPTH); 27 | char** messages = ::backtrace_symbols(trace, trace_size); 28 | o << "\n!!Stack backtrace:\n"; 29 | for (int i = 0; i < trace_size; ++i) o << "!! " << messages[i] << '\n'; 30 | #endif 31 | } 32 | 33 | 34 | } 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /graehl/shared/static_fgets_buf.h: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl - http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | /** \file 15 | 16 | . 17 | */ 18 | 19 | #ifndef STATIC_FGETS_BUF_JG_2014_12_31_H 20 | #define STATIC_FGETS_BUF_JG_2014_12_31_H 21 | #pragma once 22 | 23 | #ifndef READ_BUFSIZE 24 | #define READ_BUFSIZE (8 * 1024 * 1024) 25 | #endif 26 | 27 | #ifndef FGETS_UNLOCKED 28 | #if _GNU_SOURCE 29 | #define FGETS_UNLOCKED fgets_unlocked 30 | #else 31 | #define FGETS_UNLOCKED fgets 32 | #endif 33 | #endif 34 | 35 | static char buf[READ_BUFSIZE], bufstdio[READ_BUFSIZE]; 36 | #ifndef FALSE_SHARING_PROTECT 37 | #define FALSE_SHARING_PROTECT 72 38 | #endif 39 | 40 | static inline void set_static_bufstdio(FILE *fp) { 41 | setvbuf(fp, bufstdio + FALSE_SHARING_PROTECT, _IOFBF, READ_BUFSIZE - FALSE_SHARING_PROTECT); 42 | } 43 | 44 | #endif 45 | -------------------------------------------------------------------------------- /graehl/shared/static_itoa.h: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl - http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #ifndef GRAEHL_SHARED__STATIC_ITOA_H 15 | #define GRAEHL_SHARED__STATIC_ITOA_H 16 | 17 | #include 18 | #include 19 | 20 | 21 | namespace graehl { 22 | 23 | namespace { 24 | static const int utoa_bufsize = 40; // 64bit safe. 25 | static const int utoa_bufsizem1 = utoa_bufsize-1; // 64bit safe. 26 | THREADLOCAL char utoa_buf[utoa_bufsize]; // note: 0 initialized 27 | } 28 | 29 | inline char *static_utoa(unsigned n) { 30 | assert(utoa_buf[utoa_bufsizem1]==0); 31 | return utoa(utoa_buf+utoa_bufsizem1, n); 32 | } 33 | 34 | inline char *static_itoa(int n) { 35 | assert(utoa_buf[utoa_bufsizem1]==0); 36 | return itoa(utoa_buf+utoa_bufsizem1, n); 37 | } 38 | 39 | }//graehl 40 | 41 | #endif 42 | -------------------------------------------------------------------------------- /graehl/shared/string.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl - http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // string: as opposed to a tree. 
15 | #ifndef STRING_HPP 16 | #define STRING_HPP 17 | 18 | #include 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | //#include 25 | #include 26 | #include 27 | #include 28 | 29 | #include 30 | 31 | namespace graehl { 32 | 33 | template > struct String : public array { 34 | typedef L Label; 35 | }; 36 | 37 | } 38 | 39 | #endif 40 | -------------------------------------------------------------------------------- /graehl/shared/string_tr.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl - http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | #ifndef STRING_TR_HPP 15 | #define STRING_TR_HPP 16 | 17 | 18 | namespace graehl { 19 | 20 | // [] to {} 21 | 22 | template 23 | void write_tr(O &o,S const& s,F map) { 24 | for (typename S::const_iterator i=s.begin(),e=s.end();i!=e;++i) 25 | o< 29 | S tr(S const& s,F map) { 30 | S r(s); // work on a copy; s is const and must stay untouched 31 | for (typename S::iterator i=r.begin(),e=r.end();i!=e;++i) // iterate the copy r, not the const input s 32 | *i=map(*i); 33 | return r; 34 | } 35 | 36 | } 37 | 38 | 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /graehl/shared/stringkey.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl - http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #include "stringkey.h" 15 | 16 | namespace graehl { 17 | StringKey StringKey::empty(""); 18 | } 19 | -------------------------------------------------------------------------------- /graehl/shared/strstrsep.h: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl - http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #ifndef STRSTRSEP_H 15 | #define STRSTRSEP_H 16 | 17 | char *strstrsep(char **stringp, const char *delim); 18 | char *strsep_(char **stringp, const char *delims); 19 | 20 | /** 21 | strsep(stringp, " \t\n"). 22 | */ 23 | inline char* strsepspaces(char **stringp) { 24 | char* s; 25 | if ((s = *stringp) == NULL) return NULL; 26 | char c; 27 | for (char *tok = s;;) { 28 | c = *s++; 29 | if (!c) { 30 | *stringp = NULL; 31 | return tok; 32 | } else if (c == ' ' || c == '\n' || c == '\t') { 33 | s[-1] = 0; 34 | *stringp = s; 35 | return tok; 36 | } 37 | } 38 | /* NOTREACHED */ 39 | } 40 | 41 | static inline char *unstrstr(char *lasttok, char *begin) { 42 | while (--lasttok >= begin) { 43 | if (*lasttok == 0) 44 | *lasttok = ' '; 45 | } 46 | return begin; 47 | } 48 | 49 | 50 | #endif 51 | -------------------------------------------------------------------------------- /graehl/shared/test/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | g++ -ggdb -O0 -I/home/graehl/t -I/home/graehl/src/boost tree.cpp -o tree 3 | # g++ -I.. -I../../boost slist.cpp -o slist 4 | # g++ -I.. 
-I../../boost weight_underflow.cpp -o weight_underflow 5 | -------------------------------------------------------------------------------- /graehl/shared/test/backtrace.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl - http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #define DEBUG 15 | #define MAIN 16 | 17 | #include "debugprint.hpp" 18 | 19 | void joe() { 20 | BACKTRACE; 21 | throw std::logic_error("something went wrong."); 22 | } 23 | 24 | void murphy() { 25 | BACKTRACE; 26 | joe(); 27 | } 28 | 29 | int main() 30 | { 31 | DBPC2("hi",1); 32 | try { 33 | BACKTRACE; 34 | murphy(); 35 | } catch (std::exception &e) { 36 | std::cerr << e.what() << std::endl; 37 | BackTrace::print_on(std::cerr); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /graehl/shared/test/epsilon.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl - http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | using namespace std; 22 | using namespace graehl; 23 | 24 | template 25 | void showApart(Float a, Float b) { 26 | cout<<"a="< 15 | 16 | #define MAIN 17 | //#define SINGLE_PRECISION 18 | #define DOUBLE_PRECISION 19 | 20 | #include 21 | #include 22 | 23 | int main(int argc, char *argv[]) 24 | { 25 | using namespace graehl; 26 | using namespace std; 27 | 28 | if (argc<2) { 29 | cerr<<"argument: tree with int labels"; 30 | return -1; 31 | } 32 | tree t; 33 | std::string s(argv[1]); 34 | string_to(argv[1],t); 35 | cout << t << "\n"; 36 | return 0; 37 | } 38 | -------------------------------------------------------------------------------- /graehl/shared/the_null_ostream.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl - http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | #ifndef GRAEHL__SHARED__THE_NULL_OSTREAM_HPP 15 | #define GRAEHL__SHARED__THE_NULL_OSTREAM_HPP 16 | 17 | #include 18 | 19 | #ifdef GRAEHL__SINGLE_MAIN 20 | # define GRAEHL__NULL_OSTREAM_MAIN 21 | #endif 22 | 23 | #ifdef GRAEHL__NULL_OSTREAM_MAIN 24 | null_ostream the_null_ostream; 25 | #else 26 | /// singleton/constant (only need one) 27 | extern null_ostream the_null_ostream; 28 | #endif 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /graehl/shared/time_space_report.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl - http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | #ifndef GRAEHL__SHARED__TIME_SPACE_REPORT_HPP 15 | #define GRAEHL__SHARED__TIME_SPACE_REPORT_HPP 16 | 17 | #include 18 | #include 19 | 20 | namespace graehl { 21 | 22 | struct time_space_change 23 | { 24 | static char const* default_desc() 25 | { return "\ntime and memory used: "; } 26 | time_change tc; 27 | memory_change mc; 28 | void print(std::ostream &o) const 29 | { 30 | o << tc << ", memory " << mc; 31 | } 32 | 33 | typedef time_space_change self_type; 34 | TO_OSTREAM_PRINT 35 | }; 36 | 37 | typedef auto_report time_space_report; 38 | 39 | } 40 | 41 | 42 | #endif 43 | -------------------------------------------------------------------------------- /graehl/shared/unimplemented.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl - http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | #ifndef GRAEHL_SHARED__UNIMPLEMENTED_HPP 15 | #define GRAEHL_SHARED__UNIMPLEMENTED_HPP 16 | 17 | #include 18 | 19 | namespace graehl { 20 | 21 | struct unimplemented_exception : public std::runtime_error 22 | { 23 | unimplemented_exception(char const* c) : std::runtime_error(c) { } 24 | }; 25 | 26 | inline void unimplemented(char const* m="unimplemented") { 27 | throw unimplemented_exception(m); 28 | } 29 | 30 | } 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /graehl/shared/warning_pop.h: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl-http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #include 15 | #ifdef _MSC_VER 16 | #pragma warning(pop) 17 | #elif defined(__clang__) 18 | #pragma clang diagnostic pop 19 | #elif HAVE_DIAGNOSTIC_PUSH 20 | #pragma GCC diagnostic pop 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /graehl/shared/warning_push.h: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl-http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #include 15 | #ifdef _MSC_VER 16 | #pragma warning(push) 17 | #elif defined(__clang__) 18 | #pragma clang diagnostic push 19 | #elif HAVE_DIAGNOSTIC_PUSH 20 | #pragma GCC diagnostic push 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /graehl/shared/weight.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Jonathan Graehl - http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #include 15 | #include 16 | 17 | namespace graehl { 18 | 19 | } 20 | -------------------------------------------------------------------------------- /sblm/Makefile: -------------------------------------------------------------------------------- 1 | TRUNK=../.. 
2 | LOCAL=/home/nlg-02/graehl/isd/hpc-opteron 3 | BOOST=$(LOCAL)/include 4 | BOOSTLIB=$(LOCAL)/lib 5 | 6 | vpath %.cpp .:$(TRUNK)/xrsparse/src 7 | 8 | clean: 9 | rm -f *.o xrs-pcfg-events 10 | 11 | xrs-pcfg-events: xrs.cpp xrs-pcfg-events.cpp 12 | g++ -o $@ $^ -I$(TRUNK)/xrsparse -I $(TRUNK)/gusc -I$(BOOST) -L$(BOOSTLIB) -lboost_thread -pthread 13 | -------------------------------------------------------------------------------- /sblm/README: -------------------------------------------------------------------------------- 1 | lexical items are quoted as in sbmt rules. they're lexical because they're non-variable tree leaves. but in the event file we strip off the variable prefix. so we leave the quotes to distinguish the 2 in PCFG rhs 2 | 3 | had-pcfg-probs - hadoop driver. to test: local=1 ~/blobs/sblm/latest/had-pcfg-probs 1000.eng-parse 4 | 1000.eng-parse - some trees in ghkm-input format. 5 | pcfg-map - output sblm pcfg events 6 | fast-lhs-sums-map - produce sblm lhs counts 7 | cat-pcfg-for-divide - produce event count lhs-sum. prefaced with (TOTAL_NT) and (TOTAL_LEX) sums for unigram bo. 8 | add-pcfg-feature - for had-rules, output id\tfeats\n 9 | 10 | had-rules --pcfg=training.pcfg-counts 11 | -------------------------------------------------------------------------------- /sblm/TODO: -------------------------------------------------------------------------------- 1 | test everything. 2 | 3 | had-* works 4 | add-* works 5 | 6 | what about had-rules pipeline integration? - test it, may work. argument is had-rules --pcfg=training.pcfg-counts or had-rules -p training.pcfg-counts 7 | 8 | length distribution in pcfg backoff - exponential p(stop)? 9 | 10 | unigram backoff given parent, child index -> parent,* -> * 11 | 12 | no. either you saw rewrite 13 | 14 | backoff features get fixed weight, or some fixed method? 15 | 16 | SGT? other smoothing of counts per lhs? binned counts? #1count pcfg rewrites feat? 
17 | 18 | nice thing about non-tuned params: measure ppx of data 19 | 20 | validate smoothing methods using held-out trees. 21 | 22 | -------------------------------------------------------------------------------- /sblm/count.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys, itertools 3 | 4 | # count.py 5 | # input: key \t ... \t count 6 | # output: key \t sum 7 | 8 | def stdinfields(): 9 | for line in sys.stdin: 10 | yield line.rstrip().split("\t") 11 | 12 | if __name__ == "__main__": 13 | for (key,records) in itertools.groupby(stdinfields(), lambda r: r[0]): 14 | sumcount = sum(int(r[-1]) for r in records) 15 | print "%s\t%s" % (key, sumcount) 16 | 17 | -------------------------------------------------------------------------------- /sblm/dumpx.py: -------------------------------------------------------------------------------- 1 | ../gextract/dumpx.py -------------------------------------------------------------------------------- /sblm/example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env pypy 2 | #-*- python -*- 3 | 4 | #hadoop mapper for PCFG: sbmt training format tree input -> parent children\t1 5 | 6 | version="0.1" 7 | 8 | test=True 9 | test_in='1000.eng-parse' 10 | default_in=None 11 | 12 | import os,sys 13 | sys.path.append(os.path.dirname(sys.argv[0])) 14 | 15 | import unittest 16 | 17 | import tree 18 | import optparse 19 | 20 | from graehl import * 21 | from dumpx import * 22 | 23 | ### main: 24 | 25 | def main(opts): 26 | log("pcfg-map v%s"%version) 27 | log(' '.join(sys.argv)) 28 | 29 | import optfunc 30 | @optfunc.arghelp('input','input file here (None = STDIN should be default in production)') 31 | 32 | def options(input=default_in,test=test): 33 | if test: 34 | sys.argv=sys.argv[0:1] 35 | input=test_in 36 | main(Locals()) 37 | 38 | optfunc.main(options) 39 | 40 | 
-------------------------------------------------------------------------------- /sblm/fast-lhs-sums-map: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # faster: fast-lhs-sums-map | sort | count.py 4 | # warning: set high enough timeout! you will see no output until end. 5 | # TODO: 6 | # TODO: cause output to happen every N sec. instead, stderr progress 7 | 8 | # slower: lhs-sums-map | precombine.py | count.py | sort | count.py 9 | 10 | # input: LHS ... count 11 | # (any whitespace terminates LHS) 12 | 13 | # output: LHS\tSUM 14 | 15 | # could have made output repeat LHS (preserve original line incl. exact whitespace). but didn't. 16 | 17 | my %c; 18 | my $lil=1000; 19 | my $big=$lil*70; 20 | select STDERR; 21 | $|=1; 22 | select STDOUT; 23 | while(<>) { 24 | print STDERR "." unless $. % $lil; 25 | print STDERR "$.\n" unless $. % $big; 26 | # my ($r,$rest)=split / /,$_,2; 27 | /^(\S+).*\t(\S+)\s*$/ or die "expected lhs,...,TAB,count,NEWLINE in $_"; 28 | $c{$1}+=$2; 29 | } 30 | print STDERR "\nDONE.\n"; 31 | for (keys %c) { 32 | # &debug('event',$_,$c{$_}); 33 | print "$_\t$c{$_}\n"; 34 | } 35 | 36 | -------------------------------------------------------------------------------- /sblm/graehl.py: -------------------------------------------------------------------------------- 1 | ../gextract/graehl.py -------------------------------------------------------------------------------- /sblm/lhs-sums-map: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # input: LHS ... 4 | # (any whitespace terminates LHS) 5 | 6 | # output: LHS\t... 7 | 8 | # could have made output repeat LHS (preserve original line incl. exact whitespace). but didn't. 
9 | 10 | while(<>) { 11 | my ($r,$rest)=split ' ',$_,2; 12 | print $r,"\t",$rest; 13 | } 14 | -------------------------------------------------------------------------------- /sblm/optfunc.py: -------------------------------------------------------------------------------- 1 | ../gextract/optfunc.py -------------------------------------------------------------------------------- /sblm/pcfg-backoff: -------------------------------------------------------------------------------- 1 | #-*- python -*- 2 | 3 | # using backoff.py toolkit (should eventually allow mapreduce) for training and evaluating a simple PCFG sblm 4 | 5 | import os,sys 6 | sys.path.append(os.path.realpath(os.path.dirname(sys.argv[0]))) 7 | 8 | from graehl import * 9 | from dumpx import * 10 | from pcfg import * 11 | from backoff import * 12 | 13 | class PCFG(Model): 14 | pass 15 | 16 | optfunc.main(backoff_main_opts) 17 | -------------------------------------------------------------------------------- /sblm/pcfg-map: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env pypy 2 | #-*- python -*- 3 | 4 | #hadoop mapper for PCFG: 5 | #sbmt training format trees on stdin (or -input) 6 | #print PARENT CHILDREN+\t1\n 7 | 8 | version="0.1" 9 | 10 | test=True 11 | test_in='10.eng-parse' 12 | default_in='-' 13 | 14 | import os,sys 15 | sys.path.append(os.path.realpath(os.path.dirname(sys.argv[0]))) 16 | 17 | import unittest 18 | 19 | import tree 20 | import optparse 21 | 22 | from graehl import * 23 | from dumpx import * 24 | from pcfg import * 25 | 26 | 27 | ### main: 28 | 29 | def print_pcfg_event(t,digit2at=True,out=sys.stdout): 30 | ev=sbmt_lhs_pcfg_event(t,digit2at) 31 | # if ev is None: return 32 | out.write(event2str(ev)) 33 | out.write("\t1\n") 34 | 35 | 36 | def main(opts): 37 | log("pcfg-map v%s"%version) 38 | log(' '.join(sys.argv)) 39 | for line in open_in(opts.input): 40 | t=raduparse(line) 41 | if t is None: 42 | continue 43 | for n in t.preorder(): 
44 | print_pcfg_event(n,opts.digit2at,sys.stdout) 45 | 46 | import optfunc 47 | @optfunc.arghelp('input','input file here (- means STDIN)') 48 | 49 | def options(input=default_in,test=False,digit2at=True): 50 | if test: 51 | sys.argv=sys.argv[0:1] 52 | input=test_in 53 | main(Locals()) 54 | 55 | optfunc.main(options) 56 | 57 | -------------------------------------------------------------------------------- /sblm/pcfg-map-precomb: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | d=$(readlink -nfs $(dirname $0)) 3 | buflines=${buflines:-1000000} 4 | $d/pcfg-map | $d/precombine.py -b $buflines | $d/count.py 5 | -------------------------------------------------------------------------------- /sblm/precombine.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys, getopt, collections 3 | 4 | # precombine.py [-k ] [-b ] 5 | # prepare map output for input to a combiner 6 | # = number of key fields (default 1) 7 | # = maximum number of records in buffer (default 100000) 8 | 9 | if __name__ == "__main__": 10 | opts, args = getopt.gnu_getopt(sys.argv[1:], 'k:b:') 11 | opts = dict(opts) 12 | 13 | n_keys = int(opts.get('-k', 1)) 14 | buf_size = int(opts.get('-b', 100000)) 15 | 16 | buf = collections.defaultdict(list) 17 | count = 0 18 | for line in sys.stdin: 19 | record = line.rstrip().split('\t') 20 | key = tuple(record[:n_keys]) 21 | buf[key].append(record) 22 | count += 1 23 | 24 | if count >= buf_size: 25 | for key, records in buf.iteritems(): 26 | for record in records: 27 | print "\t".join(record) 28 | buf.clear() 29 | count = 0 30 | 31 | for key, records in buf.iteritems(): 32 | for record in records: 33 | print "\t".join(record) 34 | -------------------------------------------------------------------------------- /sblm/rules: -------------------------------------------------------------------------------- 1 | NP-0(NESTED(JJ("bonus") PIG("pig") 
NESTED(JJ("bonus") x1:NP-0)) NN-2 ("volume") x0:NNS-0) -> "pommes" "frites" x0 ### count=3 id=122 2 | NP-0(NESTED(JJ("bonus") PIG("pig") NESTED(JJ("bonus") x0:NP-0)) NN-2 ("volume") x1:NNS-0) -> "pommes" x1 "frites" x0 ### count=3 id=121 3 | NP-0(NNP-0 ("agency") NNS-0("proposals")) -> "frites" ### id=124 count=1 4 | NP-0(NNP-0( "agency") NNS-0("proposals")) -> "pommes" "frites" ### id=123 count=3 5 | NP-0(NNP-0( "agency") NNS-0("proposals")) -> "frites" ### id=125 count=1 6 | NPB-0(NNP-0( "agency") NNS-0("proposals")) -> "pommes" "frites" ### id=126 count=3 7 | NPB-0(NNP-0( "agency") NNS-0("proposals")) -> "frites" ### id=127 count=1 8 | -------------------------------------------------------------------------------- /sblm/test.sh: -------------------------------------------------------------------------------- 1 | . ~graehl/isd/hints/bashlib.sh 2 | export PATH=~graehl/t/graehl/util:$PATH 3 | in=${1:-10.eng-parse} 4 | pre=${pre:-${in%.eng-parse}.} # default output prefix: $in minus its .eng-parse suffix, plus '.' 5 | showvars_required in pre 6 | export local=1 7 | savemap=tmp.count.map iomr-hadoop $in ${pre}counted ./pcfg-map ./count.py 8 | savemap=tmp.sums.map iomr-hadoop ${pre}counted ${pre}lhs-sums ./lhs-sums-map ./count.py 9 | ./lhs-sums-map ${pre}counted | mapsort | ./count.py > ${pre}lhs-sums 10 | ./cat-pcfg-for-divide ${pre}lhs-sums ${pre}counted 11 | -------------------------------------------------------------------------------- /sblm/tree.py: -------------------------------------------------------------------------------- 1 | ../gextract/tree.py -------------------------------------------------------------------------------- /util/.gdbinit: -------------------------------------------------------------------------------- 1 | catch throw 2 | r 3 | -------------------------------------------------------------------------------- /util/.gitignore: -------------------------------------------------------------------------------- 1 | .gitconfig 2 | --------------------------------------------------------------------------------
/util/.octaverc: -------------------------------------------------------------------------------- 1 | setenv("GNUTERM","x11") 2 | PS1 ">> " 3 | -------------------------------------------------------------------------------- /util/.svn.authorsfile: -------------------------------------------------------------------------------- 1 | mhopkins = Mark Hopkins 2 | mdreyer = Markus Dryer 3 | jmay = Jonathan May 4 | skohli = Saiyam Kohli 5 | zwang = Ziyuan Wang 6 | graehl = Jonathan Graehl 7 | jgraehl = Jonathan Graehl 8 | jturian = Joseph Turian 9 | marcu = Daniel Marcu 10 | olegb = Oleg Botchkarev 11 | ithayer = Ignacio Thayer 12 | pust = Michael Pust 13 | wwang = Wei Wang 14 | quamrul = Quamrul Tipu 15 | lhuang = Liang Huang 16 | -------------------------------------------------------------------------------- /util/C-small.cc: -------------------------------------------------------------------------------- 1 | /** 2 | 3 | */ 4 | 5 | #define MAXCASES 100 6 | #include "codejam.hh" 7 | 8 | struct Case : CaseBase { 9 | I y; 10 | void read() { 11 | y = gety(); 12 | } 13 | I gety() { 14 | return -1; 15 | } 16 | void print() { 17 | PUTU(y); 18 | } 19 | void show1() { cerr << " => " << y; } 20 | void solve() {} 21 | }; 22 | 23 | CASES_MAIN(Case) 24 | -------------------------------------------------------------------------------- /util/addlicense.sh: -------------------------------------------------------------------------------- 1 | addlicense() { 2 | tmpfile=$(mktemp ${tmpdir:-/tmp}/license.XXXXXX) 3 | for f in "$@"; do 4 | if grep -q "WARRANT" $f; then 5 | echo "$f had a WARRANT string - licensed already?" 6 | head -10 $f 7 | echo ... 8 | echo 9 | else 10 | cat $license $f > $tmpfile && mv $tmpfile $f 11 | fi 12 | done 13 | } 14 | LICENSE_DIR=${LICENSE_DIR:-`dirname $0`} 15 | findc() { 16 | find . 
-name '*.hpp' -o -name '*.cpp' -o -name '*.ipp' -o -name '*.cc' -o -name '*.hh' -o -name '*.c' -o -name '*.h' 17 | } 18 | addlicenses() { 19 | local license=${1:-$LICENSE_DIR/license.txt} 20 | if [[ -f $license ]] ; then 21 | addlicense `findc` 22 | else 23 | echo "usage: cd src; addlisencec ../license.txt" 24 | fi 25 | } 26 | -------------------------------------------------------------------------------- /util/alignment-links.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | usage = ''' 4 | Show e,f,a (tab separated) in a more legible format with alignment links after word {#i:j k}. e and f are on alternating lines of output 5 | alignment a is pairs (s t)* where s 0-based indexes e, t 0-based indexes f. indices > #words (space sep) in e or f are considered NULL alignments (not aligned to any word) and ignored. 6 | ''' 7 | 8 | import argparse 9 | import sys 10 | 11 | parser=argparse.ArgumentParser(description=usage) 12 | 13 | def aword(i, a, w): 14 | return '{%d:%s}%s' % (i, ' '.join(map(str, a)), w) 15 | 16 | def awords(a, w): 17 | return ' '.join(aword(i, a[i], w[i]) for i in range(len(w))) 18 | 19 | 20 | def forfiles(infiles): 21 | for f in infiles: 22 | for line in f: 23 | forline(line) 24 | 25 | def forline(line): 26 | fields = line.split('\t') 27 | if len(fields) < 3: return 28 | S, T, A = fields[-3:] 29 | S = S.split() 30 | T = T.split() 31 | A = A.split() 32 | al = [(int(A[i]), int(A[i+1])) for i in range(0, len(A), 2)] 33 | s2t = [[] for _ in S] 34 | t2s = [[] for _ in T] 35 | for s,t in al: 36 | if s < len(S) and t < len(T): 37 | s2t[s].append(t) 38 | t2s[t].append(s) 39 | print(awords(s2t, S)) 40 | print(awords(t2s, T)) 41 | print 42 | 43 | def main(infiles): 44 | forfiles([open(x, 'r') for x in infiles] if len(infiles) else [sys.stdin]) 45 | 46 | if __name__ == '__main__': 47 | main(sys.argv[1:]) 48 | -------------------------------------------------------------------------------- 
/util/bash.txt: -------------------------------------------------------------------------------- 1 | inside [[ 2 | 3 | || logical or (double brackets only) 4 | && logical and (double brackets only) 5 | < string comparison (no escaping necessary within double brackets) 6 | -lt numerical comparison 7 | = string equality 8 | == string matching with globbing (double brackets only, see below) 9 | =~ string matching with regular expressions (double brackets only, see below) 10 | -n string is non-empty 11 | -z string is empty 12 | -eq numerical equality 13 | 14 | -ne numerical inequality 15 | 16 | [[ "$t" == abc* ]] # true (globbing) 17 | [[ "$t" == "abc*" ]] # false (literal matching) 18 | [[ "$t" =~ [abc]+[123]+ ]] # true (regular expression) 19 | [[ "$t" =~ "abc*" ]] # false (literal matching) 20 | 21 | Note that starting with bash version 3.2 the regular or globbing expression 22 | must not be quoted. If your expression contains whitespace you can store it in a variable: 23 | r="a b+" 24 | [[ "a bbb" =~ $r ]] # true 25 | 26 | 27 | Avoiding Temporary Files 28 | 29 | Some commands expect filenames as parameters so straightforward pipelining does not work. 
30 | This is where <() operator comes in handy as it takes a command and transforms it into something 31 | which can be used as a filename: 32 | 33 | # download and diff two webpages 34 | diff <(wget -O - url1) <(wget -O - url2) 35 | -------------------------------------------------------------------------------- /util/c++space: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -i~ 2 | # In-place filter: joins an "else" line onto the preceding indented "}" line of the same indent, 3 | # and adds a space between if/for/while/switch/foreach and "(", before a "{" that follows a non-space, and between ")" and ":". 4 | while(<>) { 5 | if (/^( +)}\s*$/) { 6 | print $delay if ($delay); 7 | $indentlvl = length $1; 8 | $delay=$_; 9 | } else { 10 | if ($delay) { 11 | if (/^( +)(else.*)$/ && length($1)==$indentlvl) { 12 | $_="$1} $2\n"; 13 | } else { 14 | print $delay; 15 | } 16 | $delay=undef; 17 | } 18 | s/ (if|for|while|switch|foreach)\(/ $1 (/; 19 | s/(?<=\S){/ {/; 20 | s/\):/) :/; 21 | print; 22 | } 23 | # a "}" still held back when the current file ends belongs to that file: flush it before <> moves on 24 | if (eof(ARGV) && $delay) { 25 | print $delay; 26 | $delay=undef; 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /util/ccache-wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | name=`basename $0` 3 | d=`dirname $0` 4 | if [[ $d != . ]] ; then 5 | export PATH=`dirname $0`:$PATH 6 | fi 7 | ccbasename=${name#ccache-} 8 | CCACHE_DIR=${CCACHE_DIR:-/local/graehl/ccache} 9 | mkdir -p $CCACHE_DIR || CCACHE_DIR= 10 | if [[ -d $CCACHE_DIR ]] ; then 11 | export CCACHE_DIR=$CCACHE_DIR 12 | fi 13 | exec ccache $ccbasename "$@" 14 | -------------------------------------------------------------------------------- /util/charvocab.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python3 2 | from __future__ import print_function 3 | import sys 4 | import codecs 5 | from collections import Counter 6 | 7 | # python 2/3 compatibility 8 | if sys.version_info < (3, 0): 9 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) 10 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) 11 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin) 12 | else: 13 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer) 14 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer) 15 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer) 16 | 17 | c = Counter() 18 | 19 | for line in sys.stdin: 20 | for char in line.rstrip("\r\n"): 21 | c[char] += 1 22 | 23 | for key,f in sorted(c.items(), key=lambda x: x[1], reverse=True): 24 | print("%s %s # U+%04x"%(key, f, ord(key))) 25 | -------------------------------------------------------------------------------- /util/check-condor: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | condor_q -format "%s " ClusterId -format "%s " Iwd -format "%s\n" Out \ 4 | | sort -k2 \ 5 | | perl -MTerm::ANSIColor -ane '$dir=$F[1]; if($dir ne $prev_dir) {print ($.>1?"\n":""); print color "bold blue"; ++$cnt_dirs; print "($cnt_dirs) $dir\n"; print color "reset"; $cnt_jobs=0} ++$cnt_jobs; 6 | my $host=""; 7 | my $g = $F[2]; 8 | $g =~ s/.out(put)?$//; 9 | my $hostf = "$dir/$g.err.$F[0]"; 10 | if ( -f $hostf ) { 11 | my $fh; 12 | open $fh,$hostf or die "opening $hostf"; 13 | my $l = <$fh>; 14 | chomp $l; 15 | $host="$1" if $l =~ /on (.*)$/; 16 | $hosts{$host} = 1; 17 | } 18 | print "$cnt_jobs $F[0] | $F[2] $host\n"; $prev_dir=$dir; 19 | END { foreach (keys %hosts) { 20 | $out=`set -x;ssh -o StrictHostKeyChecking=no -o BatchMode=yes -o ConnectTimeout=20 $_ "top -b -n 1 -u $ENV{USER} | grep '"'^ *[0-9]'"' | grep -v top | grep -v grep | grep -v ssh | grep -v bash | grep -v perl"`; 21 | chomp $out; 22 | print "$_ $out\n"; 23 | }} 24 | ' 25 | 
-------------------------------------------------------------------------------- /util/close-ns-inplace.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -i~ 2 | 3 | my @lines; 4 | 5 | while (<>) { 6 | push @lines, $_; 7 | if (eof(ARGV)) { 8 | $i=$#lines; 9 | $m=$i - 10; 10 | $m = 0 if ($m < 0); 11 | while($i > $m && $lines[$i] =~ /^#endif|^\s*$/) { --$i; } 12 | if ($lines[$i] =~ s#^(}+) *//(ns|.*namespace).*$#$1#) { 13 | if ($i >= 2) { 14 | if ($lines[$i-1] =~ /\S/) { 15 | $lines[$i-1] .= "\n\n"; 16 | } elsif ($lines[$i-2] =~ /\S/) { 17 | $lines[$i-2] .= "\n"; 18 | } 19 | } 20 | } else { 21 | $e=$i; 22 | while($i > $m && $lines[$i] =~ /^}\s*$/) { --$i; } 23 | $nclose = $e - $i; 24 | if ($nclose) { 25 | $s = $i; 26 | while ($s > 0 && $lines[$s] =~ /^\s*$/) { --$s; } 27 | ++$s; 28 | splice @lines,$s,$e-$s+1,"\n","\n",('}' x $nclose)."\n"; 29 | } 30 | } 31 | print for (@lines); 32 | @lines=(); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /util/codejam-example.cc: -------------------------------------------------------------------------------- 1 | #define MAXCASES 100 2 | #include "codejam.hh" 3 | 4 | struct Case : CaseBase { 5 | typedef map Runs; 6 | Runs runs; 7 | u64 N, K; 8 | u64 floorhalf; 9 | u64 ceilhalf; 10 | u64 take() { 11 | assert(!runs.empty()); 12 | auto i = runs.end(); 13 | --i; 14 | u64 k = i->first; 15 | U& n = i->second; 16 | if (!--n) runs.erase(i); 17 | return k; 18 | } 19 | void enter() { 20 | u64 last = take(); 21 | assert(last); 22 | --last; 23 | floorhalf = last / 2; 24 | ceilhalf = last - floorhalf; 25 | add(floorhalf); 26 | add(ceilhalf); 27 | } 28 | void add(u64 x) { 29 | if (x) ++runs[x]; 30 | } 31 | 32 | void read() { 33 | N = GETu64; 34 | K = GETu64; 35 | assert(N); 36 | assert(K); 37 | assert(K <= N); 38 | runs.clear(); 39 | runs[N] = 1; 40 | } 41 | void print() { 42 | putU(ceilhalf); 43 | putsp(); 44 | putU(floorhalf); 45 | } 46 
| void show1() { 47 | } 48 | void solve() { 49 | for (u64 i = 0; i < K; ++i) enter(); 50 | } 51 | }; 52 | 53 | CASES_MAIN(Case) 54 | -------------------------------------------------------------------------------- /util/color.xetex: -------------------------------------------------------------------------------- 1 | \PassOptionsToPackage{dvipsnames,usenames}{color} 2 | \usepackage{color} 3 | -------------------------------------------------------------------------------- /util/config.fish: -------------------------------------------------------------------------------- 1 | function l --description 'List entire contents of directory using long format' 2 | ls -lah $argv 3 | end 4 | 5 | function lt --description 'List (by time) entire contents of directory using long format' 6 | ls -lhrt $argv 7 | end 8 | -------------------------------------------------------------------------------- /util/datespan.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env pypy 2 | from graehl import * 3 | import sys 4 | if __name__ == '__main__': 5 | for x in sys.argv[1:]: 6 | l,u=filedaterange(x,False) 7 | sys.stderr.write('%s %s\n'%(l,u)) 8 | a=u-l if (u is not None and l is not None) else '???' 9 | l,u=(min(nonone((ctime(x),l))),max(nonone((mtime(x),u)))) 10 | sys.stderr.write('%s %s\n'%(l,u)) 11 | b=u-l 12 | print '%s\n%s\n\n'%(a,b), 13 | -------------------------------------------------------------------------------- /util/dictdiff: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env pypy 2 | #-*- python -*- 3 | from graehl import * 4 | from nbest import * 5 | from dumpx import * 6 | usage=""" 7 | given two input files, print sorted (optionally absolute) differences between their numeric f=val. 
8 | (note: multiply appearing keys use the last value) 9 | """ 10 | 11 | def main(rest_=[],sortabs=False,bypercent=False,alphabetical=False,usage_=usage,header=True): 12 | fnames=rest_ 13 | fs=map(readfrom,fnames) 14 | if len(fs)!=2: 15 | error(usage) 16 | desc='NAME\t[%s] - [%s]\tPERCENT CHANGE'%fnames 17 | ds=[] 18 | for f in fs: 19 | d=dict() 20 | for l in readfrom(f): 21 | for k,v in yieldfields_num(l): 22 | d[k]=float(v) 23 | ds.append(d) 24 | #ds=[dict(flatten(yieldfields_num(l) for l in readfrom(f))) for f in fs] #dict to remove dups. flatten to combine across all file lines 25 | da,db=ds 26 | absf=abs if sortabs else identity 27 | dd=[(k,v,v/max(abs(da.get(k,0.)),abs(db.get(k,0.)))) for (k,v) in dict_diff(da,db,diff).iteritems()] 28 | if alphabetical: 29 | keyf=identity 30 | elif percent: 31 | keyf=lambda x:(absf(x[2]),absf(x[1])) 32 | else: 33 | keyf=lambda x:absf(x[1]) 34 | dd.sort(key=keyf) 35 | if header: 36 | print desc 37 | else: 38 | info(desc) 39 | for k,v,frac in dd: 40 | print '%s\t%s\t%s'%(k,v,percent_change(frac)) 41 | import optfunc 42 | optfunc.main(main) 43 | -------------------------------------------------------------------------------- /util/dotprod.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | def readmap(f): 5 | m = {} 6 | if type(f) == str: 7 | f = open(f, 'r') 8 | for line in f: 9 | (k, v) = line.split() 10 | if k in m: 11 | raise "duplicate %s" % k 12 | m[k] = float(v) 13 | return m 14 | 15 | 16 | def dotprod(m1, m2): 17 | sum = 0.0 18 | for k in m1: 19 | if k in m2: 20 | v1 = m1[k] 21 | v2 = m2[k] 22 | sys.stderr.write('+ (%s * %s = %s) // %s\n' % (v1, v2, v1 * v2, k)) 23 | sum += v1 * v2 24 | return sum 25 | 26 | (f1, f2) = sys.argv[1:] 27 | m1 = readmap(f1) 28 | m2 = readmap(f2) 29 | print dotprod(m1, m2) 30 | -------------------------------------------------------------------------------- /util/dropcaches.c: 
-------------------------------------------------------------------------------- 1 | /** for linux, 2 | 3 | echo 3 | sudo tee /proc/sys/vm/drop_caches 4 | 5 | might work but might prompt for password 6 | 7 | you can't +suid a shell script, but you could +suid this. 8 | 9 | */ 10 | #include <stdio.h> 11 | 12 | char const* const dropname = "/proc/sys/vm/drop_caches"; 13 | /* exit status: 0 after writing "3" to dropname, 1 if it could not be opened for writing */ 14 | int main() { 15 | FILE *f = fopen(dropname, "w"); 16 | if (f) { 17 | fprintf(f, "3\n"); 18 | fclose(f); 19 | } else { 20 | fprintf(stderr, "Couldn't write to %s - you must run as (setuid) root on a Linux system?\n", dropname); 21 | return 1; 22 | } 23 | return 0; 24 | } 25 | -------------------------------------------------------------------------------- /util/dumpx.py: -------------------------------------------------------------------------------- 1 | ../gextract/dumpx.py -------------------------------------------------------------------------------- /util/edit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | case $(uname) in 3 | Darwin) 4 | lwarch=Apple 5 | ;; 6 | Linux) 7 | lwarch=FC12 8 | shopt -s globstar || true 9 | ;; 10 | *) 11 | lwarch=Windows ;; 12 | esac 13 | 14 | if [[ $lwarch = Windows ]] ; then 15 | emacscli=emacsclient 16 | emacssrv=Emacs 17 | else 18 | # emacsapp=/Applications/Emacs.app/Contents/MacOS/ 19 | emacsapp=/usr/local 20 | emacscli=$emacsapp/bin/emacsclient 21 | emacssrv=$emacsapp/bin/emacs 22 | fi 23 | 24 | if [[ $CONSOLE ]] ; then 25 | exec /usr/bin/emacs -nw "$@" 26 | else 27 | exec $emacscli -a $emacssrv "$@" 28 | fi 29 | -------------------------------------------------------------------------------- /util/etree.py: -------------------------------------------------------------------------------- 1 | ../sblm/etree.py -------------------------------------------------------------------------------- /util/featstats.py: -------------------------------------------------------------------------------- 1 | stats.py 
-------------------------------------------------------------------------------- /util/findscripts.sh: -------------------------------------------------------------------------------- 1 | interp=$1 2 | shift 3 | dir=${2:-.} 4 | for f in $(find $dir -type f -size -1000k ! -name '*~' ! -name '*svn*') ; do 5 | if head -1 $f | grep '^#!/' | fgrep -q "$interp" ; then 6 | echo $f 7 | fi 8 | done 9 | -------------------------------------------------------------------------------- /util/fix-include-guard-inplace.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -i~ 2 | 3 | my @lines; 4 | while (<>) { 5 | push @lines, $_; 6 | if (eof(ARGV)) { 7 | $i=$#lines; 8 | $e=$i; 9 | $m=$i - 10; 10 | $m = 0 if ($m < 0); 11 | @endif = (); 12 | while($i > $m) { 13 | $_ = $lines[$i]; 14 | if (/^\#endif/) { 15 | last if scalar @endif; 16 | @endif = ("\n#endif\n"); 17 | } elsif (/\S/) { 18 | last; 19 | } 20 | --$i; 21 | } 22 | $len = $e-$i; 23 | splice @lines,$i+1,$len,@endif if ($len > 0); 24 | print for (@lines); 25 | @lines=(); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /util/fixunrpn_: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | while(<>) { 4 | chomp; 5 | next unless /\t(.)(.*_)$/; 6 | $l1 = lc($1); 7 | print "$1$2\t$l1$2\n" if ($l1 ne $1); 8 | } 9 | -------------------------------------------------------------------------------- /util/float-round.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | use utf8; 4 | binmode STDIN, ':utf8'; 5 | binmode STDOUT, ':utf8'; 6 | 7 | my $default_precision=$ENV{DIGITS}||5; 8 | 9 | sub real_prec { 10 | my ($n,$prec)=@_; 11 | $prec=$default_precision unless defined $prec; 12 | sprintf("%.${prec}g",$n); 13 | } 14 | 15 | my $num_match=qr/[+\-]?(?:\.\d+|\d+(?:\.\d*)?(?:[eE][+\-]?\d+)?)/; 16 | 17 | 
while(<>) { 18 | s/($num_match)/real_prec($1)/eg; 19 | print; 20 | } 21 | -------------------------------------------------------------------------------- /util/forall.sub: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -i~ 2 | 3 | while(<>) { 4 | s{(?:BOOST_FOREACH|foreach) \(([^,)]+), (.+)+\)( |$)}{for ($1 : $2)$3}; 5 | print; 6 | } 7 | -------------------------------------------------------------------------------- /util/format-doxygen-c-comment: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w -i~ 2 | use strict; 3 | my $com = 0; 4 | my $wantspaces = 0; 5 | my $hang = 3; 6 | my $lastblank = 0; 7 | my $kwstarts = '\\'; 8 | while(<>) { 9 | chomp; 10 | if (m{^( *)(/\*\*)(.*)}) { 11 | my $body = $3; 12 | my $open = $2; 13 | my $space = $1; 14 | if (!m{\*/} || m{\*/\s*$}) { 15 | $wantspaces = length($space) + $hang; 16 | $com = 1; 17 | $lastblank = !($body =~ m{\S}); 18 | if ($body =~ s/^( *)[@\\]/$1$kwstarts/) { 19 | $_ = "$space$open$body"; 20 | } 21 | s/[@\\]brief ?//; 22 | $com = 0 if (m{\*/}); 23 | } 24 | } elsif ($com) { 25 | if (m{\*/}) { 26 | $com = 0; 27 | } elsif (m{\S}) { 28 | s/^( *)\* /$1 /; 29 | m{^( *)} || die; 30 | my $needspaces = $wantspaces - length($1); 31 | $_ = (' ' x $needspaces) . $_ if ($needspaces > 0); 32 | my $nl = $lastblank ? '' : "\n"; 33 | s/^( *)[@\\]/$nl$1$kwstarts/; 34 | s/[@\\]brief ?//; 35 | } 36 | $lastblank = !/\S/; 37 | } 38 | print $_,"\n"; 39 | } 40 | -------------------------------------------------------------------------------- /util/giraffe: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . 
~graehl/isd/hints/bashlib.sh 3 | unset PBS_O_WORKDIR 4 | echo $d/giraffe.0.3 $(realpath "$@") 5 | exec $d/giraffe.0.3 $(realpath "$@") 6 | -------------------------------------------------------------------------------- /util/gist: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | cmd = %Q{curl https://gist.github.com/gists -s -L -o /dev/null -w "%{url_effective} " } 4 | 5 | files = ARGV.empty? ? Dir["**/*"] : Dir[*ARGV].uniq 6 | 7 | files.select { |f| File.file?(f) }.each_with_index do |path, i| 8 | cmd << %Q{-F "file_ext[gistfile#{i}]=#{File.extname(path)[1..-1]}" } 9 | cmd << %Q{-F "file_name[gistfile#{i}]=#{File.basename(path)}" } 10 | cmd << %Q{-F "file_contents[gistfile#{i}]=<#{path}" } 11 | end 12 | 13 | exec cmd 14 | -------------------------------------------------------------------------------- /util/gitalias.sh: -------------------------------------------------------------------------------- 1 | git config --global alias.co checkout 2 | git config --global alias.br branch 3 | git config --global alias.ci commit 4 | git config --global alias.st status 5 | git config --global alias.unstage 'reset HEAD --' 6 | git config --global alias.last 'log -1 HEAD' 7 | -------------------------------------------------------------------------------- /util/gnuplot.auto.inc: -------------------------------------------------------------------------------- 1 | set ytics autofreq tc lt 1 2 | -------------------------------------------------------------------------------- /util/graehl.py: -------------------------------------------------------------------------------- 1 | ../gextract/graehl.py -------------------------------------------------------------------------------- /util/growth: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env pypy 2 | #-*- python -*- 3 | #TODO: approx #lines/sec (look in new bytes only) 4 | from graehl import * 5 | 6 | def 
mtime_age(path): 7 | return str(datenow()-mtime(path)) 8 | 9 | tsep='\t' 10 | fsep=' ' 11 | 12 | import time 13 | def main(rest_='',sleep=600,fields='SsMm',N=-1): 14 | logcmd() 15 | fs=rest_ 16 | captions={'S':'size','s':'size/sec','M':'mtime','m':'delta(mtime)'} 17 | #print "sleep=%s"%sleep 18 | print "start=%s"%datenow() 19 | for f in fs: print "file=%s"%f 20 | print tsep.join([captions[f] for f in fields]) 21 | sleep=float(sleep) 22 | s=None 23 | n=0 24 | a=dict() 25 | while True: 26 | if N>=0 and n>N: break 27 | s2=[(i,mtime(fs[i]),filesize(fs[i])) for i in range(len(fs))] 28 | if s is not None: 29 | a['M']=[str(x[1]) for x in s2] 30 | a['m']=[str(x[1]-s[x[0]][1]) for x in s2] 31 | a['S']=[mega(x[2]) for x in s2] 32 | a['s']=[mega((x[2]-s[x[0]][2])/sleep) for x in s2] 33 | print tsep.join(fsep.join(a[k]) for k in fields) 34 | time.sleep(sleep) 35 | s=s2 36 | n+=1 37 | import optfunc 38 | optfunc.main(main) 39 | -------------------------------------------------------------------------------- /util/hexnorm.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -i~ 2 | my %id; 3 | my $idre = qr/(?:0x|thread:)([0-9a-f]+)/; 4 | my $opre = '#x'; 5 | my $nid = 0; 6 | sub getid { 7 | my ($x) = @_; 8 | $opre.(exists $id{$x} ? 
$id{$x} : ($id{$x} = $nid++)); 9 | } 10 | while(<>) { 11 | s/$idre/getid($1)/eg; 12 | print; 13 | } 14 | -------------------------------------------------------------------------------- /util/indent-c-comment: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w -i~ 2 | use strict; 3 | my $com = 0; 4 | my $wantspaces = 0; 5 | my $hang = 3; 6 | while(<>) { 7 | if (m{^( *)/\*\*} && !m{\*/}) { 8 | $com = 1; 9 | $wantspaces = length($1) + $hang; 10 | } elsif ($com) { 11 | if (m{\*/}) { 12 | $com = 0; 13 | } elsif (m{\S}) { 14 | s/^( *)\* /$1 /; 15 | m{^( *)} || die; 16 | my $needspaces = $wantspaces - length($1); 17 | print ' ' x $needspaces if ($needspaces > 0); 18 | s/^( *)\@/$1\\/; 19 | } 20 | } 21 | print; 22 | } 23 | -------------------------------------------------------------------------------- /util/joinleft: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env pypy 2 | from graehl import * 3 | from collections import defaultdict 4 | 5 | def main(rest_=['-'],keyfields=1,sep='\t',npad=0,allow_over_npad=True,padval='',sort=True,header=False): 6 | t=ListDict() 7 | keys=[] 8 | i=0 9 | ncol=[0 for _ in rest_] 10 | for i,f in ival(rest_): 11 | for l in open_in(f): 12 | l=l.rstrip() 13 | f=l.split(sep,keyfields) 14 | k=tuple(f[:keyfields]) 15 | if k not in t: keys.append(k) 16 | v=f[keyfields:] 17 | maxeq(ncol,i,len(v)) 18 | at_expand(t[k],i,v,[]) 19 | #v=pad(v,npad,padval,npad==0) 20 | #t[k]+=v 21 | i+=1 22 | if sort: 23 | keys=sorted(keys) 24 | for i in indices[ncols]: 25 | ncols[k]=max(ncols[k],npad) if allow_over_npad else npad 26 | for k in keys: 27 | print sep.join(list(k)+flatlist(pad(l,ncol[i],pad=padval) for (i,l) in ival(t[k]))) 28 | 29 | import optfunc 30 | optfunc.main(main) 31 | -------------------------------------------------------------------------------- /util/license.txt: -------------------------------------------------------------------------------- 1 
| // Copyright 2014 Jonathan Graehl - http://graehl.org/ 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | -------------------------------------------------------------------------------- /util/localgcc.sh: -------------------------------------------------------------------------------- 1 | case $(uname) in 2 | Darwin) 3 | lwarch=Apple 4 | ;; 5 | Linux) 6 | lwarch=FC12 7 | shopt -s globstar || true 8 | ;; 9 | *) 10 | lwarch=Windows ;; 11 | esac 12 | gccprefix=${gccprefix:-/local/gcc} 13 | # appendld DIR: append DIR to this platform's library search variables (DYLD_FALLBACK_LIBRARY_PATH on Apple, otherwise LD_RUN_PATH and LD_LIBRARY_PATH), dropping a leading ':' 14 | appendld() { 15 | if [[ $lwarch = Apple ]] ; then 16 | DYLD_FALLBACK_LIBRARY_PATH+=":$1" 17 | export DYLD_FALLBACK_LIBRARY_PATH=${DYLD_FALLBACK_LIBRARY_PATH#:} 18 | else 19 | LD_RUN_PATH+=":$1" 20 | export LD_RUN_PATH=${LD_RUN_PATH#:} 21 | LD_LIBRARY_PATH+=":$1" 22 | export LD_LIBRARY_PATH=${LD_LIBRARY_PATH#:} 23 | fi 24 | } 25 | if [[ $NOLOCALGCC = 1 ]] ; then 26 | gccprefix= 27 | fi 28 | if [[ -d $gccprefix ]] ; then 29 | export PATH=$gccprefix/bin:$PATH 30 | appendld "$gccprefix/lib64" 31 | fi 32 | export CXX=ccache-g++ 33 | export CC=ccache-gcc 34 | -------------------------------------------------------------------------------- /util/mflist.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | my $d="\t"; 3 | while(<>) { 4 | last if /Most Recent Quarter Data/; 5 | } 6 | $l=10; 7 | while(<>) { 8 | if (/([^<]+)<\/td>/) { $n=$1; $l=0; } else { 9 | if (/([^<]+)<\/td>/) { 10 | if 
($l==0) { 11 | $t=$1; 12 | } elsif ($l==1) { 13 | $c=$1; 14 | $c=~s/,//g;$c=int($c+.499); 15 | print "$t$d$c$d$n\n"; 16 | } 17 | } else { 18 | $l=10; 19 | } 20 | ++$l; 21 | } } 22 | -------------------------------------------------------------------------------- /util/nbest.py: -------------------------------------------------------------------------------- 1 | ../sblm/nbest.py -------------------------------------------------------------------------------- /util/nfeats: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env pypy 2 | #-*- python -*- 3 | usage=""" 4 | (from file weights, or from positional arguments (nbests)) take weight vector or nbest features, and print number of features starting with pre with abs(weight or value)>=epsilon 5 | """ 6 | from graehl import * 7 | from dumpx import * 8 | def main(pre='',epsilon=0,weights='',rest_=[],printfeats=False,usage_=usage): 9 | if len(weights): 10 | l=firstline(weights) 11 | fvs=[str2weights(l)] 12 | elif len(rest): 13 | fvs=flatten(dict(yieldfields_num(l) for l in readfrom(f)) for f in rest_) 14 | fs=set() 15 | for fv in fvs: 16 | for (f,v) in fv.iteritems(): 17 | if f.startswith(pre) and v>=epsilon: 18 | fs.add(f) 19 | dump(fs) 20 | if printfeats: 21 | for f in sorted(fs): 22 | print f 23 | print len(fs) 24 | 25 | import optfunc 26 | optfunc.main(main) 27 | 28 | -------------------------------------------------------------------------------- /util/no-trailing-space-inplace.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -i~ 2 | while(<>) { 3 | chomp; 4 | s/\s+$//; 5 | print $_,"\n"; 6 | } 7 | -------------------------------------------------------------------------------- /util/optfunc.py: -------------------------------------------------------------------------------- 1 | ../gextract/optfunc.py -------------------------------------------------------------------------------- /util/pandoc.constantia.css: 
-------------------------------------------------------------------------------- 1 | body { 2 | margin: auto; 3 | padding-right: 1em; 4 | padding-left: 1em; 5 | max-width: 44em; 6 | border-left: 1px solid black; 7 | border-right: 1px solid black; 8 | color: black; 9 | font-family: Verdana, sans-serif; 10 | font-size: 100%; 11 | line-height: 140%; 12 | color: #333; 13 | } 14 | pre { 15 | border: 1px dotted gray; 16 | background-color: #ececec; 17 | color: #111111; /* six hex digits: a seven-digit value is not a valid color and the declaration is dropped */ 18 | padding: 0.5em; 19 | } 20 | code { 21 | font-family: monospace; 22 | } 23 | h1 a, h2 a, h3 a, h4 a, h5 a { 24 | text-decoration: none; 25 | color: #7a5ada; 26 | } 27 | h1, h2, h3, h4, h5 { font-family: verdana; 28 | font-weight: bold; 29 | border-bottom: 1px dotted black; 30 | color: #7a5ada; } 31 | h1 { 32 | font-size: 130%; 33 | } 34 | 35 | h2 { 36 | font-size: 110%; 37 | } 38 | 39 | h3 { 40 | font-size: 95%; 41 | } 42 | 43 | h4 { 44 | font-size: 90%; 45 | font-style: italic; 46 | } 47 | 48 | h5 { 49 | font-size: 90%; 50 | font-style: italic; 51 | } 52 | 53 | h1.title { 54 | font-size: 200%; 55 | font-weight: bold; 56 | padding-top: 0.2em; 57 | padding-bottom: 0.2em; 58 | text-align: left; 59 | border: none; 60 | } 61 | 62 | dt code { 63 | font-weight: bold; 64 | } 65 | dd p { 66 | margin-top: 0; 67 | } 68 | 69 | #footer { 70 | padding-top: 1em; 71 | font-size: 70%; 72 | color: gray; 73 | text-align: center; 74 | } 75 | -------------------------------------------------------------------------------- /util/pandoc.css: -------------------------------------------------------------------------------- 1 | body { 2 | margin: auto; 3 | padding-right: 1em; 4 | padding-left: 1em; 5 | max-width: 44em; 6 | border-left: 1px solid black; 7 | border-right: 1px solid black; 8 | color: black; 9 | font-family: Verdana, sans-serif; 10 | font-size: 100%; 11 | line-height: 140%; 12 | color: #333; 13 | } 14 | pre { 15 | border: 1px dotted gray; 16 | background-color: #ececec; 17 | color: #111111; /* six hex digits: a seven-digit value is not a valid color and the declaration is dropped */ 18 | padding: 0.5em; 
19 | } 20 | code { 21 | font-family: monospace; 22 | } 23 | h1 a, h2 a, h3 a, h4 a, h5 a { 24 | text-decoration: none; 25 | color: #7a5ada; 26 | } 27 | h1, h2, h3, h4, h5 { font-family: verdana; 28 | font-weight: bold; 29 | border-bottom: 1px dotted black; 30 | color: #7a5ada; } 31 | h1 { 32 | font-size: 130%; 33 | } 34 | 35 | h2 { 36 | font-size: 110%; 37 | } 38 | 39 | h3 { 40 | font-size: 95%; 41 | } 42 | 43 | h4 { 44 | font-size: 90%; 45 | font-style: italic; 46 | } 47 | 48 | h5 { 49 | font-size: 90%; 50 | font-style: italic; 51 | } 52 | 53 | h1.title { 54 | font-size: 200%; 55 | font-weight: bold; 56 | padding-top: 0.2em; 57 | padding-bottom: 0.2em; 58 | text-align: left; 59 | border: none; 60 | } 61 | 62 | dt code { 63 | font-weight: bold; 64 | } 65 | dd p { 66 | margin-top: 0; 67 | } 68 | 69 | #footer { 70 | padding-top: 1em; 71 | font-size: 70%; 72 | color: gray; 73 | text-align: center; 74 | } 75 | -------------------------------------------------------------------------------- /util/pcfg.py: -------------------------------------------------------------------------------- 1 | ../sblm/pcfg.py -------------------------------------------------------------------------------- /util/pychecks.sh: -------------------------------------------------------------------------------- 1 | d=`dirname $0` 2 | . 
#!/usr/bin/python
"""Print a reproducible, comma-separated list of pseudo-random byte values.

usage: random-c-array.py [number-of-bytes [seed]]

The values are derived by chaining MD5 digests of the seed, so the same seed
always produces the same list. The output is NOT cryptographically secure.
"""

import sys
import hashlib
import itertools


def reproducible_random(seed):
    """Yield an endless, deterministic stream of ints in 0..255.

    Each round hashes the current state with MD5 and yields the 16 digest
    bytes; the next state is the digest followed by the first 16 bytes of the
    previous state.

    seed: the initial state. A text string is encoded as UTF-8 first so the
    function works on both Python 2 and Python 3.
    """
    if not isinstance(seed, bytes):
        seed = seed.encode('utf-8')
    state = seed
    while True:
        # hashlib replaces the long-deprecated `md5` module (gone in Python 3).
        digest = hashlib.md5(state).digest()
        # bytearray iterates as ints on every Python version.
        for value in bytearray(digest):
            yield value
        state = digest + state[0:len(digest)]


def usage():
    """Write the usage line to stderr and exit with status 1."""
    sys.stderr.write("arg1 = # of bytes, arg2 = seed\n")
    sys.exit(1)


def main(args):
    """Parse [n [seed]] from args and print n values on one line to stdout."""
    seed = "random-c-array.py-encrypted-seed-seed"
    n = 32
    if len(args) >= 1:
        n = int(args[0])
    if len(args) == 2:
        seed = args[1]
    elif len(args) > 2:
        usage()
    sys.stderr.write('[n=%s] [seed=%s]\n' % (n, seed))
    values = itertools.islice(reproducible_random(seed), n)
    sys.stdout.write(', '.join(map(str, values)) + '\n')


if __name__ == '__main__':
    main(sys.argv[1:])
#!/usr/bin/env python
#
# Print relative path from $1 to $2 e.g. /a/b/c/d to /a/b/c1/d1 = ../../c1/d1
# Author: Cimarron Taylor, graehl

import os, sys

def pathsplit(p, rest=[]):
    """Break path p into its components and return them followed by rest.

    Splitting stops as soon as os.path.split yields an empty head or an empty
    tail; whatever is left at that point becomes the first component.
    """
    parts = rest
    while True:
        head, tail = os.path.split(p)
        if len(head) < 1:
            return [tail] + parts
        if len(tail) < 1:
            return [head] + parts
        parts = [tail] + parts
        p = head

def commonpath(l1, l2, common=[]):
    """Return (common + shared prefix, rest of l1, rest of l2)."""
    shared = 0
    limit = min(len(l1), len(l2))
    while shared < limit and l1[shared] == l2[shared]:
        shared += 1
    return (common + list(l1[:shared]), l1[shared:], l2[shared:])

def relpath(p1, p2):
    """Return the relative path that leads from p1 to p2 ('.' if equal)."""
    _, up, down = commonpath(pathsplit(p1), pathsplit(p2))
    # One '../' for every component of p1 below the common prefix.
    pieces = ['../' * len(up)] if len(up) > 0 else []
    pieces = pieces + down
    if len(pieces) == 0:
        return '.'
    return os.path.join(*pieces)

if __name__ == '__main__':
    frompath = sys.argv[1]
    topath = sys.argv[2]
    print(relpath(os.path.abspath(frompath), os.path.abspath(topath)))
#!/usr/bin/perl -i~
# Edit the files named on the command line in place (the -i~ switch keeps a
# backup with a "~" suffix), putting a single space between a struct, class
# or namespace name and an opening brace that directly follows it:
#   "struct  Foo{"  ->  "struct Foo {"
# Only the first match of each pattern on a line is rewritten.
use strict;
use warnings;

# A C/C++ identifier.
my $id = qr/[A-Za-z_][0-9A-Za-z_]*/;

while (my $line = <>) {
    # The brace is escaped: an unescaped literal "{" inside a pattern is
    # deprecated in recent Perl releases.
    $line =~ s/(struct|class)\s+($id)(\{)/$1 $2 $3/;
    $line =~ s/(namespace)\s+($id)(\{)/$1 $2 $3/;
    print $line;
}
#!/usr/bin/perl -CSDA
use utf8;
use 5.014;

# Token separator: a single space by default, or the empty string (one token
# per character) when the environment variable "chars" is set to a true value.
my $sep = $ENV{chars} ? '' : ' ';

# Cut every input line in two: the first half of its tokens (rounded up) is
# written to STDOUT, the remaining tokens to STDERR.
while (defined(my $line = <>)) {
    chomp $line;
    my @left  = split $sep, $line;
    my $keep  = int((scalar(@left) + 1) / 2);
    my @right = splice @left, $keep;
    say STDERR join($sep, @right);
    say join($sep, @left);
}
#!/usr/bin/env pypy
import sys
from graehl import *
from collections import defaultdict

# Accumulate per-name statistics over the numbers found in a text file.
#
# input: a file name, '-' for stdin, or an already-open iterable of lines.
# mean/variance/stddev/error: passed through to each Stats accumulator
#   (Stats and write_dict are presumably provided by graehl -- confirm there).
# sparse: if true, every accumulator's N is overwritten with the number of
#   counted lines before the results are written.
# skipblank: if true, lines without any number are not counted.
#
# Each whitespace-separated field is handled as follows:
#   name=number  -> the number is recorded under "name"
#   number       -> recorded under the preceding non-numeric field if there
#                   was one, otherwise under the field's column index
#   anything else becomes the name for the next number on the line.
#
# (Comments rather than a docstring, so that whatever optfunc derives from
# the function object is left untouched.)
def stats_main(input='numbers.txt',mean=True,variance=True,stddev=True,error=True,sparse=True,skipblank=True):
    v=defaultdict(lambda:Stats(mean=mean,variance=variance,stddev=stddev,stderror=error))
    opened=None
    if input=='-':
        input=sys.stdin
    if type(input)==str:
        # Remember the handle so it is closed once reading is done.
        input=opened=open(input)
    N=0
    try:
        for line in input:
            fs=line.split()
            name=None
            haven=False
            for i,f in enumerate(fs):
                if name is None:
                    name=i
                try:
                    e=f.find('=')
                    if e>0:
                        name=f[:e]
                        ff=float(f[e+1:])
                    else:
                        ff=float(f)
                    v[name].count(ff)
                    haven=True
                    name=None
                except ValueError:
                    # Not a number: use the field as the next value's name.
                    name=f
            if haven or not skipblank: N+=1
    finally:
        if opened is not None:
            opened.close()
    if sparse:
        # values() works on Python 2 and 3 alike (itervalues() is 2-only).
        for s in v.values():
            s.N=N
    write_dict(v)

import optfunc
optfunc.main(stats_main)
#!/bin/bash
# Wrapper that lets Subversion use an external diff program.
# Configure your favorite diff program here.
DIFF="diff"

# Subversion provides the paths we need as the sixth and seventh
# parameters.
LEFT=${6}
RIGHT=${7}

# The third and fifth parameters are used as the labels for the two sides.
nleft=${3}
nright=${5}

#. ~/u/bashlib.sh
#showvars_required nleft LEFT nright RIGHT
# Call the diff command (change the following line to make sense for
# your merge program). The paths are quoted so that file names containing
# spaces or glob characters reach diff as single arguments; $DIFF stays
# unquoted so it may carry extra options.
$DIFF -w -u -b --label="$nleft" "$LEFT" --label "$nright" "$RIGHT"

# Return an errorcode of 0 if no differences were detected, 1 if some were.
# Any other errorcode will be treated as fatal.
#ifndef GRAEHL_UTIL_UNIONFIND_HH
#define GRAEHL_UTIL_UNIONFIND_HH

// Union-find (disjoint set) nodes with union by rank and path compression.

// Link two set representatives and return the representative of the result.
// Declared ahead of UnionNode so that UnionNode::merge can name it.
template <class PtrT>
PtrT unionMergeRoots(PtrT a, PtrT b);

// One element of a disjoint-set forest.
//   Data: payload stored in the node.
//   U:    type of the rank counter.
// NOTE(review): the template parameter lists were missing from this header;
// they are reconstructed from the names used in the body, and the default
// for U is an assumption -- confirm against callers.
template <class Data, class U = unsigned>
struct UnionNode {
  Data data;
  // Mutable so that repr() can compress paths on a const node.
  mutable UnionNode *parent;
  U rank;
  // A new node starts as the sole member (and representative) of its own set.
  UnionNode(Data const& data=Data()) : data(data), parent(this), rank() {}
  typedef UnionNode *Ptr;
  // Return the representative of this node's set, re-pointing this node and
  // every node visited on the way directly at it.
  Ptr repr() const {
    if (parent != this)
      parent = parent->repr();
    return parent;
  }
  // Unite this node's set with o's set; returns the merged representative.
  Ptr merge(Ptr o) {
    return unionMergeRoots(repr(), o->repr());
  }
  // Same as merge(&o). Returns the merged representative, which is not
  // necessarily *this.
  UnionNode & operator += (UnionNode& o) {
    return *merge(&o);
  }
};

// a and b must already be representatives (roots). The root of lower rank is
// attached under the other one; on a tie b is attached under a and a's rank
// grows by one.
template <class PtrT>
PtrT unionMergeRoots(PtrT a, PtrT b) {
  if (a == b) return a;
  if (a->rank < b->rank)
    return a->parent = b;
  else {
    if (a->rank == b->rank)
      ++a->rank;
    return b->parent = a;
  }
}

#endif
#!/usr/bin/perl -w
# Compare two files line by line and report every line that is empty in
# exactly one of them. Dies (non-zero exit) if the files have different
# numbers of lines or if any such line was found.
#
# usage: why-empty.pl filea fileb
use strict;
use utf8;

die 'why-empty.pl filea fileb' unless scalar @ARGV == 2;

# Named $file_a/$file_b rather than $a/$b, which are Perl's sort variables.
my ($file_a, $file_b) = @ARGV;

# Open a file for reading and return its handle; dies if it cannot be opened.
# Three-argument open, so a file name can never be taken for a mode or a pipe.
sub opened {
    my ($path) = @_;
    open(my $fh, '<', $path) or die "open $path: $!";
    return $fh;
}

# Length of a line in characters (despite the name, words are not counted).
sub countwords {
    my ($text) = @_;
    return length $text;
}

my $fh_a = opened($file_a);
my $fh_b = opened($file_b);
my @lines_a = <$fh_a>;
my @lines_b = <$fh_b>;
close $fh_a;
close $fh_b;

my $count_a = scalar @lines_a;
my $count_b = scalar @lines_b;
die "#lines differ: $count_a != $count_b for why-empty.pl $file_a $file_b"
    unless $count_a == $count_b;

my @difflines;
for my $i (0 .. $count_a - 1) {
    my $line_a = $lines_a[$i];
    chomp $line_a;
    my $line_b = $lines_b[$i];
    chomp $line_b;
    my $len_a = countwords($line_a);
    my $len_b = countwords($line_b);
    if (($len_a == 0) != ($len_b == 0)) {
        # The per-line report shows the 0-based index, while the summary
        # below lists 1-based line numbers (output format left unchanged).
        print "$i: $len_a $len_b ||| $line_a ||| $line_b\n";
        push @difflines, $i + 1;
    }
}

my $ndiff = scalar @difflines;
die "$ndiff differently empty lines. line-numbers: " . join(' ', @difflines) if $ndiff;