├── README.md └── links_nlp.py /README.md: -------------------------------------------------------------------------------- 1 | # links-i-find-interesting 2 | A mirror for all the links I post on my personal Discord server. 3 | 4 | Links that I find important are marked with a star, `(*)`, next to them. 5 | 6 | NOTE: This list is a mess, but I promise you the links are cool :) 7 | 8 | NOTE: This list is so long that it will take some time for it to be synced to my server. 9 | 10 | ## Contents 11 | 12 | - [cool-ass-math](#cool-ass-math) 13 | - [math-blogs](#math-blogs) 14 | - [latex](#latex) 15 | - [cool-cs-theory](#cool-cs-theory) 16 | - [cs-general](#cs-general) 17 | - [exciting-cs-developments](#exciting-cs-developments) 18 | - [programming-languages](#programming-languages) 19 | - [low-level](#low-level) 20 | - [security-cryptography](#security-cryptography) 21 | - [computer-graphics](#computer-graphics) 22 | - [machine-learning](#machine-learning) 23 | - [cs-tech-concerns](#cs-tech-concerns) 24 | - [cs-tooling-or-sites](#cs-tooling-or-sites) 25 | - [cs-other](#cs-other) 26 | - [cs-courses](#cs-courses) 27 | - [tutorials](#tutorials-and-other-learning-resources) 28 | - [rust](#rust) 29 | - [cs-blogs](#cs-blogs) 30 | - [books](#books) 31 | - [memes](#memes) 32 | - [talks-videos](#talks-videos) 33 | - [career-and-student-resources](#career-and-student-resources) 34 | - [linguistics](#linguistics) 35 | 36 | --- 37 | 38 | ## cool-ass-math 39 | - https://stopa.io/post/269 (*) 40 | - http://www.math.wm.edu/~leemis/chart/UDR/UDR.html (*) 41 | - https://www.reddit.com/gallery/tfdd4t (*) 42 | - https://jmlr.csail.mit.edu/reviewing-papers/knuth_mathematical_writing.pdf (*) 43 | - http://www.lomont.org/papers/2003/InvSqrt.pdf 44 | - https://www.nature.com/articles/nature.2016.20491 45 | - https://people.math.harvard.edu/~shani/Teaching/141/141A-Notes.pdf 46 | - https://math.hawaii.edu/~grw/Classes/2009-2010/2009Fall/Math442_1/439notes.pdf 47 | - https://www.esat.kuleuven.be/cosic/projects/isocrypt/workshops/ 48 | - 
http://geometer.org/rubik/group.pdf 49 | - https://www.gametheory.net/ 50 | - https://planetmath.org/ 51 | - https://www.maths.ed.ac.uk/~v1ranick/papers/wigner.pdf 52 | - https://news.ycombinator.com/item?id=32281536 53 | - https://argumatronic.com/about.html 54 | - https://grossack.site/ 55 | - http://pling.jondgoodwin.com/post/rise-of-type-theory/ 56 | - https://cp4space.hatsya.com/2022/01/14/conway-conjecture-settled/ 57 | - http://brendanfong.com/programmingcats.html 58 | - http://zhat.io/articles/primer-probability-theory 59 | - https://homotopytypetheory.org/book/ 60 | - https://www.appliedcategorytheory.org/ 61 | - https://www.math.columbia.edu/~woit/wordpress/?p=3056 62 | - https://www.reddit.com/r/math/comments/txca2p/patterns_in_the_cartesian_plane_under_bitwise_xor/ 63 | - https://www.mathstat.dal.ca/~selinger/papers/lambdanotes.pdf 64 | - https://brilliant.org/wiki/lucas-theorem/ 65 | - https://maxima.sourceforge.io/ 66 | - https://bivector.net/doc.html 67 | - http://people.maths.ox.ac.uk/lackenby/tg050908.pdf 68 | - https://researchseminars.org/talk/ToposInstituteColloquium/57/ 69 | - https://andrewhead.info/assets/pdf/augmented-formulas.pdf 70 | - https://en.wikipedia.org/wiki/Brzozowski_derivative 71 | - https://uwo.ca/math/faculty/kapulkin/seminars/hottest_summer_school_2022.html 72 | - https://github.com/andrejbauer/homotopy-type-theory-course 73 | - https://matt.might.net/papers/might2011derivatives.pdf 74 | - https://www.andrew.cmu.edu/user/erijke/hott/hott_intro.pdf 75 | - https://www.cl.cam.ac.uk/~amp12/papers/catl/catl.pdf 76 | - https://1lab.dev/ 77 | - https://en.wikipedia.org/wiki/Glushkov%27s_construction_algorithm 78 | - https://zenodo.org/record/4457887#.YKPJcvaxVhE 79 | - https://arxiv.org/abs/2106.01484 80 | - https://math.rice.edu/~semmes/ (*) 81 | - https://www.theoremoftheday.org/ 82 | - https://en.wikipedia.org/wiki/Proof_of_Bertrand%27s_postulate 83 | - http://abstract.ups.edu/download/aata-20210809.pdf 84 | - 
https://www.quantamagazine.org/ 85 | - https://proofwiki.org/wiki/Category:Proofs 86 | - https://en.yna.co.kr/view/AEN20220705003951320 87 | - https://www.newyorker.com/culture/culture-desk/teaching-myself-calculus-at-sixty-five 88 | - https://www.azimuthproject.org/azimuth/show/Applied+Category+Theory+Course 89 | - https://math.stackexchange.com/questions/1232463/how-to-prove-the-language-of-all-binary-numbers-that-are-prime-is-nonregular-usi/1232511#1232511 90 | - https://github.com/leanprover-community/mathlib 91 | - https://www.ma.imperial.ac.uk/~buzzard/xena/natural_number_game/ 92 | - https://www.principiarewrite.com/ 93 | - https://www.theguardian.com/books/2022/aug/07/could-learning-algebra-in-my-60s-make-me-smarter-alec-wilkinson-a-divine-language-extract 94 | - https://twitter.com/naderi_yeganeh/status/1559914565797830656?s=21&t=NpK3hWYH0MKCBZ78FR5MsA 95 | - https://arxiv.org/pdf/2107.13242.pdf 96 | - https://users.cs.northwestern.edu/~riesbeck/proofs.html 97 | - https://madiot.fr/coq100/ 98 | - https://www.cs.uoregon.edu/research/summerschool/archives.html 99 | - http://www.cip.ifi.lmu.de/~grinberg/ 100 | - https://www.math.cmu.edu/~ploh/olympiad.shtml 101 | - https://ghoshadi.wordpress.com/ 102 | - https://forumgeom.fau.edu/index1.html 103 | - https://proofwiki.org/wiki/ProofWiki:Jokes 104 | - https://github.com/jack4818/maths-theses/blob/main/README.md 105 | - https://betterexplained.com/articles/intuitive-arithmetic-with-complex-numbers/ 106 | - https://betterexplained.com/articles/a-visual-intuitive-guide-to-imaginary-numbers/ 107 | - https://mathoverflow.net/questions/366765/issue-update-in-graph-theory-different-definitions-of-edge-crossing-numbers 108 | - https://www.reddit.com/r/math/comments/7gqhlc/what_to_say_instead_of_trivially/ 109 | - http://www.theproofistrivial.com/ 110 | - https://www.reddit.com/r/math/comments/5286ba/math_is_beautiful/ 111 | - 
https://www.reddit.com/r/math/comments/a0g2m2/shitpost_who_is_the_greatest_mathematician_of_all/ 112 | - https://realnotcomplex.com/ 113 | - https://people.math.harvard.edu/~shani/ 114 | - https://seis.bristol.ac.uk/~tz20861/ 115 | - https://sites.google.com/view/vantageseminar 116 | - https://arxiv.org/ftp/arxiv/papers/1303/1303.5965.pdf 117 | - https://www.galacticbeyond.com/music-noise/ 118 | - https://sites.google.com/view/sarahpetersen/home 119 | - https://rg1-teaching.mpi-inf.mpg.de/autrea-ws21/notes-current.pdf 120 | - https://mathoverflow.net/questions/286732/nonequivalent-definitions-in-mathematics/286751#286751 121 | - https://explained-from-first-principles.com/number-theory/ 122 | - https://injuly.in/blog/fourier-series/ 123 | - https://dozenal.org/drupal/sites_bck/default/files/db38206_0.pdf 124 | - https://florianstecker.de/Skripte/ 125 | - https://www.complexityexplorer.org/ 126 | - https://codeforces.com/blog/entry/69287 127 | - https://homepages.cwi.nl/~paulv/papers/info.pdf 128 | - https://math.stanford.edu/~vakil/216blog/ 129 | - https://www.zipcon.net/~swhite/docs/math/quaternions/associativity.html 130 | - https://thicketforte.com/ 131 | - https://arxiv.org/abs/1804.01193 132 | - https://www.people.vcu.edu/~rhammack/BookOfProof/ 133 | - https://aimath.org/textbooks/approved-textbooks/ 134 | - https://openlogicproject.org/ 135 | - https://dailynous.com/2018/11/07/new-free-open-source-multi-purpose-multi-system-logic-software/ 136 | - https://consequently.org/news/2017/twelve-things-i-love/ 137 | - https://lilypond.org/ 138 | - https://en.wikipedia.org/wiki/Permutoassociahedron 139 | - https://ps.uni-saarland.de/~forster/thesis.php 140 | - https://math.stackexchange.com/questions/64498/probability-that-two-random-numbers-are-coprime-is-frac6-pi2 141 | - https://arxiv.org/abs/1905.03014 142 | - https://arxiv.org/abs/0712.1320 143 | - https://www.cis.upenn.edu/~sweirich/papers/yorgey-thesis.pdf 144 | - 
https://www2.kenyon.edu/Depts/Math/Milnikel/boolos-godel.pdf 145 | - https://www.nku.edu/~longa/classes/mat385_resources/docs/russellpope.html 146 | - BOOK: Note on a Problem of Alan Sutcliffe 147 | - THESIS: Logic in Color A story and a language of category theory 148 | 149 | ### math-blogs 150 | - https://rjlipton.wpcomstaging.com/2022/04/05/blogs-that-are-current/ 151 | - https://11011110.github.io/blog/ 152 | - https://mathstrek.blog/ 153 | - https://www.math-only-math.com/math-blog.html 154 | - https://blogs.ams.org/matheducation/ 155 | - https://www.lesswrong.com/posts/EdFDwjsLNpgtTMJAp/great-mathematicians-on-math-competitions-and-genius 156 | - https://artofproblemsolving.com/community/c2202_math_blog_of_the_former_rising_olympian 157 | - https://proofmathisbeautiful.tumblr.com/ 158 | - https://intothecontinuum.tumblr.com/ 159 | - https://ldtopology.wordpress.com/ 160 | - https://sketchesoftopology.wordpress.com/ 161 | - https://isomorphismes.tumblr.com/tagged/mathematics 162 | - https://amathew.wordpress.com/ 163 | - https://www.futilitycloset.com/ 164 | - https://scottaaronson.blog/ 165 | - https://cameroncounts.wordpress.com/ 166 | - https://www.johndcook.com/blog/ 167 | - https://topologicalmusings.wordpress.com/ 168 | - https://davidlowryduda.com/blog/ 169 | - https://terrytao.wordpress.com/ 170 | - http://blog.tanyakhovanova.com/ 171 | - https://gowers.wordpress.com/ 172 | - https://mathblogging.org/ 173 | - https://rigtriv.wordpress.com/ 174 | - https://sbseminar.wordpress.com/ 175 | - https://unapologetic.wordpress.com/ 176 | - https://mathblog.com/ 177 | - https://jeremykun.com/ 178 | - https://plus.maths.org/content/ 179 | - https://gogeometry.com/ 180 | - https://lamington.wordpress.com/ 181 | - https://kskedlaya.org/geometryunbound/ 182 | - http://www.cut-the-knot.org/index.shtml 183 | - https://scoutmathematics.wordpress.com/ 184 | - http://bit-player.org/2022/the-middle-of-the-square 185 | - https://www.cut-the-knot.org/pythagoras/torque.shtml 
186 | - https://math.berkeley.edu/~wu/ 187 | 188 | ### latex 189 | - https://aareyanmanzoor.github.io/Texromancers.html 190 | - https://raw.githubusercontent.com/vEnhance/dotfiles/main/texmf/tex/latex/evan/evan.sty 191 | - https://detexify.kirelabs.org/classify.html 192 | - https://mirrors.rit.edu/CTAN/macros/latex/contrib/annotate-equations/annotate-equations.pdf 193 | 194 | --- 195 | ## cool-cs-theory 196 | - https://avinayak.github.io/algorithms/programming/2021/02/19/finding-mona-lisa-in-the-game-of-life.html 197 | - https://www.youtube.com/watch?v=oEAa2pQKqQU 198 | - https://doisinkidney.com/posts/2021-03-14-hyperfunctions.html 199 | - https://www.tweag.io/blog/2020-04-23-deriving-isomorphically/ 200 | - https://counterexamples.org/ 201 | - https://www.reddit.com/r/compsci/comments/u0nvdv/artificial_life_simulation_dark_forest/?ref=share&ref_source=link 202 | - https://overreacted.io/algebraic-effects-for-the-rest-of-us/?utm_source=Morning+Cup+of+Coding&utm_campaign=f563363ee2-EMAIL_CAMPAIGN_2019_07_23_05_30&utm_medium=email&utm_term=0_56b5f64c5f-f563363ee2-56972441 203 | - http://liamoc.net/holbert/ 204 | - https://www.cs.bham.ac.uk/~axj/pub/papers/handy1.pdf 205 | - https://www.cs.cmu.edu/~fp/courses/15814-f19/lectures/ 206 | - https://paperswithcode.com/ 207 | - https://www.connectedpapers.com/ 208 | - https://theory.stanford.edu/~rayyli/teaching.html 209 | - https://yufeizhao.com/ 210 | - http://people.cs.uchicago.edu/~fortnow/papers/history.pdf 211 | - https://www.sciencedirect.com/science/article/pii/S0304397515001735 212 | - https://calmcode.io/blog/inverse-turing-test.html 213 | - https://maartenfokkinga.github.io/utwente/mmf91m.pdf 214 | - https://thume.ca/2017/06/17/tree-diffing/ 215 | - https://www.cs.cmu.edu/~qinsiw/thesis/q_wang_cs_2016.pdf 216 | - http://tom7.org/papers/ 217 | - https://crypto.stanford.edu/~blynn/lambda/ 218 | - https://www.hpl.hp.com/techreports/2003/HPL-2003-148.pdf 219 | - https://www.cl.cam.ac.uk/teaching/1617/L28/gadts.pdf 
220 | 221 | ### cs-general 222 | - https://www.sas.upenn.edu/~cavitch/pdf-library/Nagel_Bat.pdf (*) 223 | - https://home.sandiego.edu/~baber/analytic/Lem1979.html (*) 224 | - https://www.cse.unr.edu/~sushil/class/ai/notes/papers/coffeehouse.html (*) 225 | - https://a-blog-with.relevant-information.com/posts/solving_hard_problems_with_intelligent_brute_force_search/ 226 | - http://wiki.c2.com/?GlobalVariablesAreBad 227 | - https://www.noulakaz.net/2007/03/18/a-regular-expression-to-check-for-prime-numbers/ 228 | - 229 | - https://arxiv.org/pdf/2203.16713.pdf 230 | - http://blog.kenficara.com/2013/06/30/irregular-language-and-regular-expressions/ 231 | - https://blog.burntsushi.net/transducers/ 232 | - https://peterellisjones.com/posts/generating-legal-chess-moves-efficiently/ 233 | 234 | ### exciting-cs-developments 235 | - https://www.quantamagazine.org/formal-verification-creates-hacker-proof-code-20160920/ 236 | - https://ai.googleblog.com/2022/04/pathways-language-model-palm-scaling-to.html 237 | - https://twitter.com/wasmerio/status/1232022730632904705 238 | - https://www.youtube.com/watch?v=sqNnN2Z4Lg4 239 | - https://openjdk.org/projects/amber/ 240 | - https://github.com/rust-lang/rust/issues/32838 241 | - https://mail.openjdk.org/pipermail/jdk-dev/2022-May/006549.html 242 | - https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8aebac82933ff1a7c8eede18cab11e1115e2062b 243 | - https://jbaker.io/2022/05/09/project-loom-for-distributed-systems/ 244 | - https://www.tweag.io/blog/2022-11-22-wasm-backend-merged-in-ghc/ 245 | - https://kristoff.it/blog/zig-self-hosted-now-what/ 246 | - https://twitter.com/manfightdragon/status/1576768394707161088 247 | - https://twitter.com/LinaAsahi/status/1575343067892051968 248 | - https://archlinux.org/news/removing-python2-from-the-repositories/ 249 | - https://devblogs.microsoft.com/commandline/systemd-support-is-now-available-in-wsl/ 250 | - https://blog.torproject.org/arti_100_released/ 251 | - 
https://twitter.com/croloris/status/1550955321694330880?s=20&t=r091_jm83tj9MMYhnR_WFw 252 | - https://github.com/readme/featured/functional-programming 253 | - https://blog.rust-lang.org/2022/07/01/RLS-deprecation.html 254 | - https://vimhelp.org/version9.txt.html#new-9 255 | 256 | ### programming-languages 257 | - http://www.paulgraham.com/icad.html (*) 258 | - https://github.com/alhassy/PrologCheatSheet 259 | - https://github.com/golang-design/history 260 | - https://www.morling.dev/blog/loom-and-thread-fairness/ 261 | - https://gavinhoward.com/2022/04/i-believe-zig-has-function-colors/ 262 | - http://www.thyer.name/phd-thesis/thesis-thyer.pdf 263 | - https://www.pyret.org/index.html 264 | - https://matt.might.net/articles/parsing-with-derivatives/ 265 | - https://research.cs.queensu.ca/home/jana/papers/bidir-survey/Dunfield21_bidir-survey.pdf 266 | - https://www.pls-lab.org/en/Domain_Theory 267 | - https://blog.nindalf.com/posts/stop-citing-tiobe/ 268 | - https://blog.ironmansoftware.com/daily-powershell/bash-powershell-cheatsheet/ 269 | 270 | ### low-level 271 | - https://zeux.io/2022/01/08/on-proebstings-law/ 272 | - https://vikramoberoi.com/a-primer-on-roaring-bitmaps-what-they-are-and-how-they-work/ 273 | - https://www.davidsalomon.name/assem.advertis/asl.pdf 274 | - https://graphics.stanford.edu/~seander/bithacks.html 275 | 276 | ### security-cryptography 277 | - https://cronokirby.com/posts/2022/08/the-paper-that-keeps-showing-up/ 278 | - https://www.iacr.org/authors/tikz/ 279 | - https://arxiv.org/abs/1401.6488 280 | 281 | ### computer-graphics 282 | - https://iquilezles.org/ (*) 283 | - https://thebookofshaders.com/ (*) 284 | - https://www.realtimerendering.com/intersections.html (*) 285 | - https://epsln.github.io/blog/indraspearl_pt1/ (*) 286 | - https://coffeebeforearch.github.io/2020/06/23/mmul.html (*) 287 | - https://ninedegreesbelow.com/photography/xyz-rgb.html (*) 288 | - https://www.youtube.com/watch?v=GpsKrAipXm8 (*) 289 | - 
https://matlib.gpuopen.com/main/materials/all (*) 290 | - https://physicallybased.info/ (*) 291 | - https://www.scratchapixel.com/ (*) 292 | - https://www.gridbugs.org/wave-function-collapse/ 293 | - https://www.diva-portal.org/smash/get/diva2:1691141/FULLTEXT01.pdf 294 | - https://www.youtube.com/watch?v=d4EgbgTm0Bg 295 | - https://bauble.studio/ 296 | - https://nianticlabs.github.io/simplerecon/ 297 | - https://www.youtube.com/watch?v=yG4ChOPyC-4 298 | - https://www.youtube.com/watch?v=GswISjlquoU 299 | - https://www.youtube.com/watch?v=BFld4EBO2RE 300 | - https://medium.com/geekculture/decompiling-nvidia-shaders-and-optimizing-5aeaeb65f828 301 | - https://nothings.org/gamedev/rasterize/ 302 | - https://blog.selfshadow.com/ 303 | - https://jcgt.org/ 304 | - https://qoiformat.org/ 305 | - https://github.com/kaveh808/kons-9 306 | - https://mrl.cs.nyu.edu/projects/image-analogies/ 307 | - https://rosenzweig.io/blog/asahi-gpu-part-5.html 308 | - https://vksegfault.github.io/posts/gentle-intro-gpu-inner-workings/ 309 | - http://simonrodriguez.fr/dragon/ 310 | - https://adrianhesketh.com/2022/03/31/use-m1-gpu-with-go/ 311 | - https://www.cs.cornell.edu/courses/cs4620/2008fa/lectures/09pipeline.pdf 312 | - https://www.skytopia.com/project/fractal/2mandelbulb.html 313 | - https://www.youtube.com/watch?v=tX4H_ctggYo 314 | - https://github.com/rust-cv 315 | - http://kylehalladay.com/blog/2020/05/20/Rendering-With-Notepad.html 316 | - https://marctenbosch.com/quaternions/ 317 | - https://www.youtube.com/watch?v=hwaBaoAPOU0 318 | - https://www.duskborn.com/posts/a-simple-vulkan-compute-example/ 319 | - https://www.reddit.com/r/generative/top/ 320 | - https://www.reddit.com/r/proceduralgeneration/top/ 321 | - https://www.chaoticafractals.com/ 322 | - https://www.shadertoy.com/view/flKyzG 323 | 324 | ### machine-learning 325 | - https://www.cis.upenn.edu/~jean/math-deep.pdf (*) 326 | - https://truyentran.github.io/phd.html (*) 327 | - 
https://www.nature.com/articles/nature14539 (*) 328 | - https://www.inference.org.uk/itprnn/book.pdf (*) 329 | - https://jalammar.github.io/illustrated-stable-diffusion/ (*) 330 | - https://github.com/labmlai/annotated_deep_learning_paper_implementations (*) 331 | - https://bair.berkeley.edu/blog/2022/05/20/crosswords/ 332 | - https://www.probabilistic-numerics.org/textbooks/ 333 | - https://arxiv.org/pdf/2108.02497.pdf (*) 334 | - https://mml-book.github.io/ 335 | - https://towardsdatascience.com/stable-diffusion-best-open-source-version-of-dall-e-2-ebcdf1cb64bc 336 | - https://proceedings.neurips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf 337 | - https://arxiv.org/pdf/1601.06733.pdf 338 | - https://thegradient.pub/graph-neural-networks-beyond-message-passing-and-weisfeiler-lehman/ 339 | - http://neuralnetworksanddeeplearning.com/chap1.html 340 | - https://arxiv.org/abs/2003.14169 341 | - https://www.youtube.com/watch?v=Lq-Y7crQo44 342 | - https://spectrum.ieee.org/deep-learning-computational-cost 343 | - https://github.com/vaaaaanquish/Awesome-Rust-MachineLearning 344 | - https://joe-antognini.github.io/ml/consciousness 345 | - https://github.com/MrinmoiHossain/Reinforcement-Learning-Specialization-Coursera/blob/master/Book/Reinforcement%20Learning%20An%20introduction%20(Second%20Edition)%20by%20Richard%20S.%20Sutton%20and%20Andrew%20G.%20Barto.pdf 346 | - https://www.joelsimon.net/evo_floorplans.html 347 | - https://web.stanford.edu/class/psych209/Readings/SuttonBartoIPRLBook2ndEd.pdf 348 | - https://pub.towardsai.net/stable-diffusion-based-image-compresssion-6f1f0a399202 349 | 350 | ### cs-blogs 351 | - https://www.ruder.io/ 352 | - http://blog.cleancoder.com/uncle-bob/2015/11/18/TheProgrammersOath.html 353 | - https://miles.land/ 354 | - http://www.paulgraham.com/gh.html 355 | - https://lucatrevisan.wordpress.com/ 356 | - https://prog21.dadgum.com/ 357 | 358 | 359 | ### cs-tech-concerns 360 | - https://prog21.dadgum.com/154.html (*) 361 | - 
http://www.paulgraham.com/pypar.html (*) 362 | - https://cor3ntin.github.io/posts/abi/ (*) 363 | - https://www.netmeister.org/blog/plagiarism.html (*) 364 | - http://www.paulgraham.com/gba.html (*) 365 | - https://pg.ucsd.edu/publications/Microsoft-Windows-which-bugs-get-reassigned_CSCW-2011.pdf (*) 366 | - https://pg.ucsd.edu/publications/Microsoft-Windows-which-bugs-get-reassigned_CSCW-2011.pdf (*) 367 | - https://github.com/srevinsaju/Firefox-Appimage/issues/26 (*) 368 | - https://www.youtube.com/watch?v=bOzaimWuWec (*) 369 | - https://github.com/uBlockOrigin/uBlock-issues/issues/338 (*) 370 | - https://artemis.sh/2022/08/21/this-program-is-illegally-packaged-in-14-distributions.html (*) 371 | - https://dkb.blog/p/google-search-is-dying 372 | - https://www.nature.com/articles/d41586-022-01516-2 373 | - https://corecursive.com/040-tech-evangelism-with-gabriel-gonzalez/ 374 | - https://9to5mac.com/2022/05/25/duckduckgo-privacy-microsoft-permission-tracking/ 375 | - https://www.quantamagazine.org/computing-expert-says-programmers-need-more-math-20220517/ 376 | - https://www.fsf.org/licensing/copilot 377 | - https://www.theatlantic.com/technology/archive/2014/08/advertising-is-the-internets-original-sin/376041/ 378 | - https://rally.mozilla.org/current-studies/ 379 | - https://www.reddit.com/r/TikTokCringe/comments/w1vte4/cs_students_showing_how_anyone_can_be_misogynistic/ 380 | - https://kotaku.com/unity-john-riccitiello-monetization-mobile-ironsource-1849179898 381 | - https://docs.github.com/en/code-security/secret-scanning/secret-scanning-partner-program 382 | - https://velvetshark.com/articles/why-do-brands-change-their-logos-and-look-like-everyone-else 383 | - http://yosefk.com/blog/redundancy-vs-dependencies-which-is-worse.html 384 | - https://www.businessinsider.com/nearly-half-genz-use-tiktok-instagram-over-google-search-2022-7 385 | - https://techbeacon.com/app-dev-testing/test-production-yes-you-can-you-should 386 | - 
https://web.stanford.edu/class/cs240/old/sp2014/readings/worse-is-better.html 387 | - https://www.pcgamer.com/hacker-jailbreaks-control-unit-that-stops-farmers-repairing-their-tractors-then-runs-doom-on-it/ 388 | - https://ghuntley.com/fracture/ 389 | - https://aestheticsforbirds.com/2021/02/12/the-house-never-loses-how-microtransactions-exploit-video-game-players/ 390 | - https://cfenollosa.com/blog/after-self-hosting-my-email-for-twenty-three-years-i-have-thrown-in-the-towel-the-oligopoly-has-won.html 391 | - https://twitter.com/stephenlacy/status/1554697077430505473 392 | - https://interviewing.io/blog/google-facebook-hiring-freeze 393 | - https://www.theguardian.com/technology/2022/sep/05/i-didnt-want-it-anywhere-near-me-how-the-apple-airtag-became-a-gift-to-stalkers 394 | - https://arstechnica.com/tech-policy/2022/09/ntsb-wants-alcohol-detection-systems-installed-in-all-new-cars-in-us/ 395 | - https://dejanmarketing.com/competitor-hack/ 396 | - https://www.abc.net.au/news/2022-11-14/former-prisoner-struggling-with-the-use-of-technology/101641072 397 | - https://stopscanningme.eu/en/ 398 | - https://www.youtube.com/watch?v=WC1kPtG8Iz8 399 | - https://www.vice.com/en/article/pkgma8/police-are-using-dna-to-generate-3d-images-of-suspects-theyve-never-seen 400 | - https://iai.tv/articles/an-irrational-world-camus-auid-1578&utm_source=reddit&_auid=2020 401 | - https://www.consumerreports.org/electronics-computers/privacy/tiktok-tracks-you-across-the-web-even-if-you-dont-use-app-a4383537813/ 402 | - https://mashable.com/article/meta-facebook-ai-chatbot-racism-donald-trump 403 | 404 | 405 | ### cs-tooling-or-sites 406 | - https://shorterlife.github.io/challenge/ (*) 407 | - https://penrose.ink/ 408 | - https://cdn.discordapp.com/attachments/530604566794862612/965494123878551592/Pinouts_V0.3.pdf 409 | - http://www.phrack.org/issues/7/3.html 410 | - https://sites.google.com/eng.ucsd.edu/ucsdbookgraph/home 411 | - https://github.com/vadimdemedes/ink 412 | - 
https://www.getzola.org/documentation/getting-started/overview/ 413 | - https://dirk.rave.org/combinatris/ 414 | - https://makeavideo.studio/ 415 | - https://tauri.app/ 416 | 417 | ### cs-other 418 | - https://btm.qva.mybluehost.me/building-arbitrary-life-patterns-in-15-gliders/ (*) 419 | - https://www.inkandswitch.com/end-user-programming/ (*) 420 | - https://utf8everywhere.org/ (*) 421 | - https://github.com/corollari/linusrants (*) 422 | - https://mrotherguy.github.io/ToyfoCSS/ 423 | - https://www.ma.imperial.ac.uk/~buzzard/xena/natural_number_game/ 424 | - https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.222.6996&rep=rep1&type=pdf 425 | - https://nedbatchelder.com/text/unipain.html 426 | - http://http://http://@http://http://?http://#http:// 427 | - https://github.com/girliemac/a-picture-is-worth-a-1000-words 428 | - https://archivesspace.mit.edu/repositories/2/resources/1305 429 | - https://practicaltypography.com/ 430 | - https://dustri.org/b/horrible-edge-cases-to-consider-when-dealing-with-music.html 431 | - https://medium.com/@sdboyer/so-you-want-to-write-a-package-manager-4ae9c17d9527 432 | - https://www.expasy.org/ 433 | - https://prog21.dadgum.com/149.html 434 | - https://world.hey.com/dhh/programmers-should-stop-celebrating-incompetence-de1a4725 435 | - https://github.com/csurfer/pyheat 436 | - https://nlp.stanford.edu/projects/glove/ 437 | - https://gwern.net/co2-coin 438 | 439 | ### cs-courses 440 | - https://www.cs.uic.edu/~jbell/CourseNotes/OperatingSystems/ 441 | - https://github.com/system-f/fp-course 442 | - https://www.cs.rice.edu/~as143/COMP480_580_Fall22/index.html 443 | - https://github.com/dddrrreee/cs240lx-22spr/blob/main/labs/README.md 444 | - https://github.com/dddrrreee/cs140e-20win 445 | - https://student.cs.uwaterloo.ca/~cs452/S22/ 446 | - https://peterellisjones.com/posts/generating-legal-chess-moves-efficiently/ 447 | - https://plfa.github.io/ 448 | - https://leanprover-community.github.io/learn.html 449 | - 
https://www.cis.upenn.edu/~cis1940/fall16/ 450 | 451 | ### tutorials and other learning resources 452 | - https://mukulrathi.com/create-your-own-programming-language/intro-to-type-checking/ 453 | - http://aosabook.org/en/llvm.html 454 | - https://github.com/jbrkr/Category_Theory_Natural_Language_Processing_NLP 455 | - https://github.com/MattPD/cpplinks 456 | - https://cirosantilli.com/x86-paging 457 | - https://doc.rust-lang.org/stable/rust-by-example/ 458 | - https://github.com/chassing/linux-sysadmin-interview-questions 459 | - https://www.leshenko.net/p/ugit/# 460 | - https://github.com/codecrafters-io/build-your-own-x 461 | - https://vim.rtorr.com/ 462 | - https://kean.blog/post/lets-build-regex 463 | - https://g-w1.github.io/blog/zig/low-level/2021/03/15/elf-linux.html 464 | - https://forem.julialang.org/wikfeldt/a-brief-tour-of-julia-for-high-performance-computing-5deb 465 | - https://pragprog.com/titles/rshaskell/effective-haskell/ 466 | - https://www.cis.upenn.edu/~cis1940/spring13/lectures/01-intro.html 467 | - https://scipy-lectures.org/advanced/advanced_numpy/index.html#advanced-numpy 468 | - https://hal.inria.fr/inria-00564007/document 469 | - https://blog.paperspace.com/numpy-optimization-vectorization-and-broadcasting/ 470 | - https://github.com/rougier/scientific-visualization-book 471 | - https://jfmc.github.io/z3-play/ 472 | - https://ordep.dev/posts/my-favorite-papers 473 | 474 | ### rust 475 | - https://rust-unofficial.github.io/patterns/ (*) 476 | - https://dylanj.xyz/posts/rust-coq-opaque-types/ (*) 477 | - https://github.com/rust-lang/rust/issues/84056#issuecomment-1184725924 (*) 478 | - https://itsallaboutthebit.com/arc-mutex/ (*) 479 | - https://blog.yoshuawuyts.com/unsafe-syntax/ (*) 480 | - https://recursion.wtf/posts/rust_schemes/ (*) 481 | - https://recursion.wtf/posts/rust_schemes_2/ (*) 482 | - https://sdleffler.github.io/RustTypeSystemTuringComplete/ (*) 483 | - https://www.thecodedmessage.com/posts/2022-07-14-programming-unwrap/ (*) 
484 | - https://zdimension.fr/how-i-learned-to-stop-worrying-and-love-macros/ (*) 485 | - https://nnethercote.github.io/perf-book/title-page.html (*) 486 | - https://rust-lang.github.io/api-guidelines/about.html (*) 487 | - https://cheats.rs/ 488 | - https://wishawa.github.io/posts/thread-scoped-async/ 489 | - https://aturon.github.io/blog/2015/09/18/reuse/ 490 | - http://cosmic.mearie.org/2014/01/periodic-table-of-rust-types/ 491 | - https://blog.m-ou.se/rust-cpp-concurrency/ 492 | - https://cglab.ca/%7Eabeinges/blah/rust-btree-case/ 493 | - https://fasterthanli.me/articles/proc-macro-support-in-rust-analyzer-for-nightly-rustc-versions 494 | - https://github.com/Byron/gitoxide 495 | - https://github.com/Kindelia/HVM 496 | - https://github.com/RWTH-OS/eduOS-rs 497 | - https://github.com/RayMarch/ferris3d 498 | - https://github.com/charliermarsh/ruff 499 | - https://github.com/dtolnay/trybuild 500 | - https://github.com/hermitcore/rusty-hermit 501 | - https://github.com/huhu/rust-search-extension 502 | - https://github.com/kaist-cp/cs220 503 | - https://github.com/matklad/cargo-xtask 504 | - https://github.com/mgattozzi/whorl 505 | - https://github.com/rulex-rs/pomsky 506 | - https://github.com/rust-lang/rust/issues/43122 507 | - https://github.com/rust-unofficial/awesome-rust 508 | - https://github.com/tlepoint/fhe.rs 509 | - https://github.com/yoav-lavi/melody 510 | - https://github.com/zesterer/chumsky 511 | - https://neosmart.net/blog/2022/implementing-truly-safe-semaphores-in-rust/ 512 | - https://people.kernel.org/linusw/rust-in-perspective 513 | - https://robert.kra.hn/posts/2022-09-09-speeding-up-incremental-rust-compilation-with-dylibs/ 514 | - https://rust-lang.github.io/rfcs/3191-debugger-visualizer.html#summary 515 | - https://towardsdatascience.com/nine-rules-for-writing-python-extensions-in-rust-d35ea3a4ec29 516 | - https://www.possiblerust.com/pattern/non-generic-inner-functions 517 | - https://www.reddit.com/r/rust/comments/w521tx/cpp_for_rust_users/ 
518 | - https://www.youtube.com/watch?time_continue=2&v=-l-8WrGHEGI&feature=emb_title 519 | - https://www.youtube.com/watch?v=Ak7kDY3_eMI 520 | - https://youtu.be/6-8-9ZV-2WQ 521 | - https://youtu.be/iuY4CTCmClg 522 | - https://www.crowdstrike.com/blog/data-science-test-drive-of-rust-programming-language/ 523 | - https://github.com/rust-ml/linfa 524 | - https://github.com/Rust-GPU/Rust-CUDA 525 | - https://github.com/tensorflow/rust 526 | - https://able.bio/haixuanTao/data-manipulation-polars-vs-rust--3def44c8 527 | - https://www.orchest.io/blog/the-great-python-dataframe-showdown-part-3-lightning-fast-queries-with-polars 528 | 529 | --- 530 | 531 | ## books 532 | - https://booksdrive.org/wp-content/uploads/2022/03/The-Pragmatic-Programmer-by-Andrew-Hunt-David-Hurst-Thomas.pdf (*) 533 | - https://ibm.github.io/neuro-symbolic-ai/events/ns-summerschool2022/ (*) 534 | - https://hackernewsbooks.com/ 535 | - https://divisbyzero.com/tales-of-impossibility/ 536 | - https://softwarefoundations.cis.upenn.edu/ 537 | - https://algebradriven.design/ 538 | - https://thinkingwithtypes.com/ 539 | - https://mitpress.mit.edu/9780262045490/ 540 | - https://github.com/joeycastillo/The-Open-Book 541 | - https://github.com/hackerkid/Mind-Expanding-Books 542 | - https://github.com/hmemcpy/milewski-ctfp-pdf/ 543 | - https://standardebooks.org/ 544 | - http://learnyouahaskell.com/ 545 | - 99 Variations on a Proof 546 | - https://the-eye.eu/public/Books/ 547 | - https://scalawagmagazine.org/2021/09/banned-books-texas-prisons/ 548 | - https://americanlibrariesmagazine.org/blogs/the-scoop/prisoners-pay-to-read-prison-tablets/ 549 | - https://codersatwork.com/ 550 | - https://link.springer.com/book/10.1007/978-0-387-48278-1 551 | - https://dev.realworldocaml.org/index.html 552 | - https://www.gutenberg.org/ 553 | 554 | --- 555 | ## talks-videos 556 | - https://www.youtube.com/watch?v=W4Mcuh38wyM (*) 557 | - https://www.youtube.com/watch?v=f6Dh5NjlZMk 558 | - 
https://www.youtube.com/watch?v=JH_Ou17_zyU 559 | - https://www.youtube.com/watch?v=KH8z1IbXelk 560 | - https://www.youtube.com/playlist?list=PL0OBHndHAAZrGQEkOZGyJu7S7KudAJ8M9 561 | - https://www.youtube.com/playlist?list=PLK_sH5jbkYciCyOTllsGyHVcHErHhtnZZ 562 | - https://www.youtube.com/c/AlphaPhoenixChannel 563 | - https://www.youtube.com/watch?v=50zPH63AdUA 564 | - https://www.youtube.com/playlist?list=PLi01XoE8jYoi3SgnnGorR_XOW3IcK-TP6 565 | - https://www.youtube.com/playlist?list=PLL61h44ln0J0Pbs2EPR71wn-8wvwxHI9z 566 | - https://www.reddit.com/r/theydidthemath/comments/vizp1l/offsite_folding_a_piece_of_paper_42_times_would/ 567 | - https://www.youtube.com/watch?v=bOXCLR3Wric 568 | - https://www.youtube.com/watch?v=UJp4q2D2Nh0 569 | - https://www.youtube.com/watch?v=uyS1cXrsgIg 570 | - https://www.reddit.com/r/DHExchange/comments/wsi1gd/sharing_my_archive_of_richard_borcherdss_lectures/ 571 | - https://www.youtube.com/watch?v=KhfZK5IIK9E 572 | - https://www.youtube.com/watch?v=3gyHKCDq1YA 573 | - https://www.youtube.com/watch?v=dwNxVpbEVcc 574 | - https://www.youtube.com/watch?v=8x374slJGuo 575 | - https://www.youtube.com/watch?v=zR_hpai3XkY 576 | - https://www.youtube.com/watch?v=88BA8aO3qXA 577 | - https://www.youtube.com/watch?v=icrXmYHnU9E 578 | - https://www.youtube.com/watch?v=9syvZr-9xwk&list=PLUl4u3cNGP60_JNv2MmK3wkOt9syvfQWY 579 | - https://www.youtube.com/c/t3ssel8r 580 | --- 581 | 582 | ### memes 583 | - https://github.com/zhuowei/nft_ptr 584 | - https://blog.plover.com/prog/burritos.html 585 | - https://leanprover.github.io/ 586 | - https://rudism.com/vim-creep/ 587 | - https://regexcrossword.com/ 588 | - http://jimbly.github.io/regex-crossword/ 589 | - https://stackoverflow.com/questions/17279712/what-is-the-smallest-possible-valid-pdf/17280876#17280876 590 | 591 | 592 | --- 593 | 594 | 595 | ## career-and-student-resources 596 | - https://www.eecs.harvard.edu/htk/phdadvice/#1 (*) 597 | - https://karpathy.github.io/2016/09/07/phd/ (*) 598 | - 
https://matt.might.net/articles/what-cs-majors-should-know/ (*) 599 | - https://web.stanford.edu/class/ee384m/Handouts/HowtoReadPaper.pdf (*) 600 | - https://jsomers.net/i-should-have-loved-biology/ (*) 601 | - https://www.cs.toronto.edu/~miller/resources.html 602 | - https://github.com/mtdvio/every-programmer-should-know 603 | - https://cs.stanford.edu/people/widom/paper-writing.html 604 | - https://matt.might.net/articles/how-to-apply-and-get-in-to-graduate-school-in-science-mathematics-engineering-or-computer-science/ 605 | - https://conquer.cra.org/students/applying 606 | - https://reufinder.com/ 607 | - https://graduate.dartmouth.edu/student-support/career-services/interview-practice-techniques/interviewing-academic-job 608 | - https://refactoring.guru/design-patterns/catalog 609 | - https://ibm.github.io/neuro-symbolic-ai/events/ns-summerschool2022/ 610 | - https://www.redblobgames.com/ 611 | - https://hackingcpp.com/cpp/cheat_sheets.html 612 | - http://www.paulgraham.com/college.html 613 | - https://noted.lol/back-to-school-self-hosted-edition/ 614 | - https://pub.towardsai.net/make-your-matplotlib-plots-stand-out-using-this-cheat-sheet-8c666de90433 615 | - https://prog21.dadgum.com/190.html 616 | - http://www.paulgraham.com/noob.html 617 | - https://prog21.dadgum.com/80.html 618 | - https://blog.rust-lang.org/inside-rust/2022/04/19/imposter-syndrome.html 619 | - https://nomadlist.com/ 620 | - https://prog21.dadgum.com/210.html 621 | - https://www.thinkful.com/blog/why-learning-to-code-is-so-damn-hard/ 622 | - https://overreacted.io/things-i-dont-know-as-of-2018/ 623 | - https://www.natolambert.com/writing/ai-phd-job-hunt 624 | - https://engineering.virginia.edu/internships-and-summer-experiences-1st-and-2nd-years 625 | - https://cims.nyu.edu/ai/educational-programs/pathways-ai/ 626 | 627 | --- 628 | ## linguistics 629 | - https://restofworld.org/2021/bringing-urdu-into-the-digital-age/ 630 | - https://github.com/gentaiscool/code-switching-papers 631 | - 
https://blog.juliosong.com/linguistics/mathematics/a-new-application-of-category-theory-in-linguistics-part-1/#top 632 | - https://golem.ph.utexas.edu/category/2018/02/linguistics_using_category_the.html 633 | - https://johncarlosbaez.wordpress.com/2018/02/11/linguistics-using-category-theory/ 634 | - https://www.denizcemonduygu.com/philo/browse/ 635 | -------------------------------------------------------------------------------- /links_nlp.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import json 4 | import requests 5 | import spacy 6 | from bs4 import BeautifulSoup 7 | from collections import defaultdict, Counter 8 | from urllib.parse import urlparse 9 | from spacy.matcher import PhraseMatcher 10 | from concurrent.futures import ThreadPoolExecutor 11 | from sklearn.feature_extraction.text import TfidfVectorizer 12 | from multiprocessing import cpu_count 13 | from io import BytesIO 14 | from pdfminer.high_level import extract_text 15 | from requests.adapters import HTTPAdapter 16 | from urllib3.util.retry import Retry 17 | import urllib3 18 | import browser_cookie3 19 | from loguru import logger 20 | from sys import stdout 21 | 22 | # Set up Loguru logging with color. 23 | logger.remove() 24 | 25 | logger.add(lambda msg: print(msg, end=""), level="INFO", colorize=True, 26 | format="{time:YYYY-MM-DD HH:mm:ss} {message}") 27 | 28 | # Define unwanted tags to filter out. 
unwanted_tags = {
    "connect", "connect world", "world communities",
    "account",
    "sign", "sign in",
    "seriously available", "seriously",
    "continue forms", "forms forgot",
    "suspicious", "does", "suspicious report", "look suspicious",
    "form look", "look", "form", "report", "does form",
    "want",
}

# Try to run spaCy on a GPU; when require_gpu() raises, log the reason and
# carry on with the CPU.
try:
    spacy.require_gpu()
    logger.info("GPU enabled for spaCy!")
except Exception as exc:
    logger.info("GPU not available. Running on CPU. {}", exc)

# Transformer-based English pipeline, plus a phrase matcher that shares its
# vocabulary.
nlp = spacy.load("en_core_web_trf")
matcher = PhraseMatcher(nlp.vocab)

# Requests elsewhere in this file are made with verify=False; silence the
# warning urllib3 would emit for each of them.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Topic name -> phrases that are registered with the phrase matcher for it.
topics = {
    "cool-ass-math": ["math", "arxiv", "proof", "theorem", "set-theory"],
    "math-blogs": ["math-blog", "mathematical writing", "math discussion"],
    "latex": ["latex", "tex", "typography"],
    "cs-general": ["computer science", "computing", "cs fundamentals"],
    "exciting-cs-developments": [
        "new technology",
        "latest cs research",
        "tech breakthrough",
    ],
    "programming-languages": ["language design", "compiler", "syntax", "parsing"],
    "low-level": ["assembly", "bitwise", "performance optimization"],
    "security-cryptography": ["security", "encryption", "cybersecurity", "hashing"],
    "computer-graphics": ["graphics", "rendering", "ray tracing", "shaders"],
    "machine-learning": ["ml", "deep learning", "neural network", "ai"],
    "cs-tech-concerns": ["tech ethics", "big tech", "privacy"],
    "cs-tooling-or-sites": ["github", "vim", "neovim", "emacs", "editor"],
    "cs-other": ["miscellaneous cs", "interesting cs topics"],
    "cs-courses": ["cs education", "computer science courses", "learning cs"],
    "tutorials": ["tutorial", "how-to", "beginner guide"],
    "rust": ["rustlang", "rust programming"],
    "cs-blogs": ["cs-blog", "tech blog", "programming insights"],
    "books": ["book", "reading", "library"],
    "memes": ["meme", "funny", "tenor", "gif"],
    "talks-videos": ["youtube", "twitch", "talk", "lecture", "conference"],
    "career-and-student-resources": ["internship", "career", "resume", "student"],
    "linguistics": ["phonetics", "syntax", "morphology", "linguistics"],
    "machine-learning-ai": ["ml", "deep learning", "neural network", "ai"],
    "rice-stuff": ["Rice", "campus", "general", "Rice University"],
    "urandom": ["random", "generator", "unix", "entropy"],
    "cryptocurrency": ["crypto", "blockchain", "bitcoin", "ethereum", "altcoin"],
    "cs-questions": ["computer science", "questions", "problems", "discussion"],
    "cs-theory": [
        "theory",
        "algorithms",
        "computation theory",
        "formal methods",
        "cs",
        "computing",
    ],
    "hooman-languages": ["language", "human", "linguistics", "communication"],
    "im-trying-to-learn": ["learning", "self-improvement", "education", "tutorial"],
    "linux": ["linux", "unix", "operating system", "opensource"],
    "student-resources": ["student", "resources", "education", "college", "learning"],
    "general": ["general", "miscellaneous", "discussion", "variety"],
    "life-things": ["life", "personal", "stories", "experiences"],
    "books-and-linguistics": [
        "book",
        "reading",
        "library",
        "phonetics",
        "syntax",
        "morphology",
        "linguistics",
    ],
    "competitive-programming": [
        "competitive programming",
        "coding competitions",
        "algorithms",
        "challenges",
    ],
    "text-editors": ["text editor", "vim", "emacs", "sublime", "editor"],
}

# The predefined topic names are exactly the keys of the mapping above.
allowed_topics = set(topics)

# Add phrase matching patterns.
# Register each topic's phrases with the matcher under the topic name.
# make_doc() only tokenizes, which is all the ORTH-based PhraseMatcher needs,
# so the transformer pipeline is not run once per phrase.
for category, phrases in topics.items():
    patterns = [nlp.make_doc(text) for text in phrases]
    matcher.add(category, patterns)


# Entity labels whose (lower-cased) text is collected as a candidate tag.
_ENTITY_LABELS = frozenset(
    {"EVENT", "WORK_OF_ART", "LAW", "NORP", "LANGUAGE", "ORG", "GPE", "PERSON"}
)


def validate_url(url):
    """Clean a raw URL string and return it only if it is a usable web URL.

    Surrounding whitespace, quotes and angle brackets are stripped first.

    Returns:
        The cleaned URL, or None when it cannot be parsed, its scheme is not
        http/https, or it has no host part.
    """
    url = url.strip().strip('"').strip("'").strip("<>")
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. a broken IPv6 host).
        return None
    if parsed.scheme not in ("http", "https") or not parsed.netloc:
        return None
    return url


def clean_text(text):
    """Collapse every run of whitespace to one space; falsy input gives ""."""
    if text:
        return " ".join(text.split())
    return ""


def sanitize_filename(name):
    """Replace characters that are invalid in filenames with an underscore."""
    return re.sub(r'[\\/*?:"<>|]', "_", name)


def setup_http_session():
    """Create a requests session with retries, pooling and browser-like headers.

    Cookies are loaded from the local Firefox profile when possible; failure
    to load them is logged and otherwise ignored.

    Returns:
        The configured requests.Session.
    """
    session = requests.Session()
    try:
        # NOTE(security): these cookies are attached to requests for every
        # host they belong to, and callers in this file fetch with
        # verify=False. Review before running against untrusted link lists.
        session.cookies = browser_cookie3.firefox()
        logger.info("Loaded cookies from Firefox!")
    except Exception as e:
        logger.warning("Could not load Firefox cookies: {}", e)
    retries = Retry(
        total=3,
        backoff_factor=2,
        status_forcelist=[500, 502, 503, 504],
        raise_on_status=False,
    )
    adapter = HTTPAdapter(max_retries=retries, pool_connections=50, pool_maxsize=50)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    # Use headers similar to Firefox to avoid 406 errors.
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:112.0) Gecko/20100101 Firefox/112.0",
        "Accept": "*/*",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive"
    })
    return session


def extract_links(filename):
    """Yield every valid http(s) URL found in the given UTF-8 text file."""
    with open(filename, "r", encoding="utf-8") as file:
        for line in file:
            for url in re.findall(r"https?://\S+", line):
                valid = validate_url(url)
                if valid:
                    yield valid


def sort_links_by_domain(links):
    """Return the links sorted by host so same-host requests are adjacent."""
    return sorted(links, key=lambda url: urlparse(url).netloc)


def fetch_page_content(url, session, retries=3):
    """Fetch a web page or PDF and return its title and leading text.

    Args:
        url: Address to fetch.
        session: requests session used for the request.
        retries: Number of attempts made when a request error occurs.

    Returns:
        A (title, text) tuple. For HTML the text is taken from the first 15
        <p> elements and is None when they hold no text; PDFs are delegated
        to extract_pdf_text. (None, None) is returned when every attempt
        failed with a request error, or when retries is 0.
    """
    for _attempt in range(retries):
        try:
            # The context manager releases the streamed connection even when
            # raise_for_status() or reading the body fails.
            with session.get(url, timeout=5, stream=True, verify=False) as response:
                response.raise_for_status()
                content_type = response.headers.get("Content-Type", "").lower()
                # Look at the path only, so query strings and upper-case
                # extensions do not hide a PDF.
                is_pdf = (
                    "pdf" in content_type
                    or urlparse(url).path.lower().endswith(".pdf")
                )
                if is_pdf:
                    return extract_pdf_text(response)
                soup = BeautifulSoup(response.text, "html.parser")
        except requests.RequestException as e:
            logger.warning("Request error for {}: {}", url, e)
            continue

        # get_text() also works when <title> is empty or has nested markup,
        # where .string is None.
        title_tag = soup.title
        title = clean_text(title_tag.get_text()) if title_tag else ""
        paragraphs = soup.find_all("p")[:15]
        text_content = clean_text(" ".join(p.get_text() for p in paragraphs))
        return title or "No Title", text_content or None

    return None, None


def extract_pdf_text(response):
    """Extract text from a PDF held in ``response.content``.

    pdfminer is tried first; if it raises or yields no text, PyMuPDF (fitz)
    is used as a fallback.

    Returns:
        ("PDF Document", text) on success, or ("Unreadable PDF", None) when
        neither extractor produced any text.
    """
    try:
        with BytesIO(response.content) as pdf_file:
            pdf_text = clean_text(extract_text(pdf_file))
        if not pdf_text:
            raise ValueError("No text extracted using pdfminer.")
        return "PDF Document", pdf_text
    except Exception as e:
        logger.warning("PDF extraction error using pdfminer: {}", e)
        try:
            import fitz  # PyMuPDF
            with fitz.open(stream=response.content, filetype="pdf") as doc:
                text = clean_text("".join(page.get_text() for page in doc))
            # Whitespace-only output counts as "nothing extracted".
            if not text:
                raise ValueError("No text extracted using PyMuPDF.")
            return "PDF Document", text
        except Exception as e2:
            logger.error("Fallback PDF extraction error: {}", e2)
            return "Unreadable PDF", None


def compute_global_tfidf(texts):
    """Fit one TF-IDF model over all document texts.

    Returns:
        A (tfidf_matrix, vectorizer) tuple; row i of the matrix belongs to
        texts[i].
    """
    texts = list(texts)
    vectorizer = TfidfVectorizer(
        stop_words="english",
        max_features=500,
        ngram_range=(1, 2),
        # A term cannot occur in two documents when there is only one, and
        # scikit-learn raises in that case, so relax the threshold.
        min_df=2 if len(texts) > 1 else 1,
    )
    tfidf_matrix = vectorizer.fit_transform(texts)
    return tfidf_matrix, vectorizer


def extract_top_keywords_from_vector(row, vectorizer, n=10, min_length=3):
    """Return the highest-weighted terms of one TF-IDF row.

    The n largest entries are taken first; entries with zero weight or a
    term shorter than min_length are then dropped, so fewer than n terms may
    be returned.
    """
    row_array = row.toarray().flatten()
    top_indices = row_array.argsort()[::-1][:n]
    feature_names = vectorizer.get_feature_names_out()
    return [
        feature_names[i]
        for i in top_indices
        if row_array[i] > 0 and len(feature_names[i]) >= min_length
    ]


def domain_based_categories(url):
    """Return the set of predefined categories implied by the URL's host."""
    domain = urlparse(url).netloc
    categories = set()
    if "github.com" in domain:
        categories.add("cs-tooling-or-sites")
    elif "arxiv.org" in domain:
        categories.add("cool-ass-math")
    elif "youtube.com" in domain or "youtu.be" in domain:
        # youtu.be is YouTube's short-link host.
        categories.add("talks-videos")
    return categories


def process_all_links(links):
    """Fetch all links concurrently and keep those that yielded content.

    Returns:
        A list of (link, title, content) tuples in the order of ``links``,
        containing only links for which both title and content are non-empty.
    """
    results = []
    with setup_http_session() as session, ThreadPoolExecutor(max_workers=20) as executor:
        futures = [
            (link, executor.submit(fetch_page_content, link, session))
            for link in links
        ]
        for link, future in futures:
            try:
                title, content = future.result()
            except Exception as e:
                # One page that fails to parse must not abort the whole run.
                logger.error("Unexpected error while processing {}: {}", link, e)
                continue
            if title and content:
                results.append((link, title, content))
    return results


def batch_nlp_processing(docs):
    """Run all texts through spaCy's pipe and return the resulting Docs.

    A single process is used when spacy.prefer_gpu() reports a GPU,
    otherwise one process per CPU.
    """
    n_process = 1 if spacy.prefer_gpu() else cpu_count()
    return list(nlp.pipe(docs, n_process=n_process))


def filter_unwanted_tags(tags, unwanted):
    """Return the tags that do not contain any unwanted phrase.

    Matching is case-insensitive and on whole words: the unwanted phrase
    "sign" removes "sign in" but keeps "design".
    """
    phrases = [p.lower().strip() for p in unwanted if p and p.strip()]
    if not phrases:
        return set(tags)
    pattern = re.compile(
        r"(?<!\w)(?:" + "|".join(re.escape(p) for p in phrases) + r")(?!\w)"
    )
    return {tag for tag in tags if not pattern.search(tag.lower())}


def _top_new_dynamic_tags(dynamic_categories, limit=25):
    """Return up to ``limit`` most frequent (tag, count) pairs that are
    neither predefined topics nor unwanted tags."""
    candidates = [
        (tag, count)
        for tag, count in dynamic_categories.most_common()
        if tag not in allowed_topics and tag not in unwanted_tags
    ]
    return candidates[:limit]


def categorize_documents(results, processed_docs, tfidf_matrix, vectorizer):
    """Assign tags to every fetched link and group the links by tag.

    Candidate tags come from named entities, phrase matches, the top TF-IDF
    keywords and domain heuristics. The 25 most frequent TF-IDF keywords that
    are neither predefined topics nor unwanted become the "new" dynamic
    tags. Each link keeps only tags that are predefined or among those, and
    falls back to "miscellaneous" when none remain.

    Args:
        results: (link, title, content) tuples.
        processed_docs: spaCy Docs, parallel to ``results``.
        tfidf_matrix: TF-IDF matrix whose row i belongs to results[i].
        vectorizer: The fitted vectorizer that produced the matrix.

    Returns:
        (categorized_links, dynamic_categories): a mapping of tag to a list
        of {"url", "title", "tags"} dicts, and a Counter of how many
        documents had each TF-IDF keyword among their top keywords.
    """
    dynamic_categories = Counter()
    link_categories = []

    for i, ((link, title, _content), doc) in enumerate(zip(results, processed_docs)):
        # NER-based tags, restricted to the selected entity labels.
        assigned = {
            ent.text.lower() for ent in doc.ents if ent.label_ in _ENTITY_LABELS
        }

        # Phrase matching: the match id resolves to the topic name.
        assigned.update(
            nlp.vocab.strings[match_id] for match_id, _start, _end in matcher(doc)
        )

        # Global TF-IDF keywords; these also feed the dynamic-tag counter.
        top_keywords = extract_top_keywords_from_vector(tfidf_matrix[i], vectorizer, n=10)
        assigned.update(top_keywords)
        dynamic_categories.update(top_keywords)

        # Domain-based heuristics.
        assigned.update(domain_based_categories(link))

        link_categories.append((link, title, assigned))

    # Filter before truncating so that up to 25 new tags are really kept.
    top_new = {tag for tag, _count in _top_new_dynamic_tags(dynamic_categories)}

    categorized_links = defaultdict(list)
    for link, title, assigned in link_categories:
        filtered_tags = filter_unwanted_tags(assigned, unwanted_tags)
        final_tags = {
            tag for tag in filtered_tags if tag in allowed_topics or tag in top_new
        }
        if not final_tags:
            final_tags.add("miscellaneous")
        # Sorted so the JSON output is stable between runs.
        tag_list = sorted(final_tags)
        for tag in tag_list:
            categorized_links[tag].append({
                "url": link,
                "title": title,
                "tags": list(tag_list)
            })

    return categorized_links, dynamic_categories


def save_categorized_links(categorized_links):
    """Write one JSON file per category into the 'categorized_links' folder."""
    os.makedirs("categorized_links", exist_ok=True)
    for category, links in categorized_links.items():
        safe_category = sanitize_filename(category)
        filepath = os.path.join("categorized_links", f"{safe_category}.json")
        with open(filepath, "w", encoding="utf-8") as file:
            json.dump(links, file, indent=4)


def test_pdf_output(source):
    """Run PDF extraction on one source and log the outcome.

    ``source`` is downloaded when it is an http(s) URL; otherwise it is read
    as a local file path. Any error is logged instead of raised.
    """
    logger.info("Testing PDF extraction for: {}", source)
    try:
        # Require the full scheme so a local file such as "http_notes.pdf"
        # is not mistaken for a URL.
        if source.lower().startswith(("http://", "https://")):
            response = requests.get(source, timeout=10, verify=False)
            response.raise_for_status()
            content = response.content
        else:
            with open(source, "rb") as f:
                content = f.read()
        # extract_pdf_text only reads the 'content' attribute, so a minimal
        # stand-in object is enough.
        DummyResponse = type("DummyResponse", (object,), {"content": content})
        title, text = extract_pdf_text(DummyResponse())
        logger.info("Extracted title: {}", title)
        if text:
            logger.info("Extracted text (first 500 chars):\n{}", text[:500])
        else:
            logger.error("No text extracted from PDF.")
    except Exception as e:
        logger.error("Error testing PDF extraction: {}", e)


def main(filename="links.txt"):
    """Run the pipeline: extract, fetch, analyse, categorize and save links.

    Args:
        filename: Text file to read the links from.
    """
    # dict.fromkeys drops duplicate links while keeping first-seen order, so
    # no page is fetched or listed twice.
    links = list(dict.fromkeys(extract_links(filename)))
    sorted_links = sort_links_by_domain(links)
    logger.info("Total links to process: {}", len(sorted_links))

    # Fetch page content concurrently.
    fetched_results = process_all_links(sorted_links)
    if not fetched_results:
        logger.error("No links fetched successfully.")
        return

    # Title and content together form the text for NLP processing.
    docs = [clean_text(f"{title} {content}") for _, title, content in fetched_results]
    processed_docs = batch_nlp_processing(docs)

    # The global TF-IDF model is fitted on the content only.
    texts = [clean_text(content) for _, _, content in fetched_results]
    tfidf_matrix, vectorizer = compute_global_tfidf(texts)

    categorized_links, dynamic_categories = categorize_documents(
        fetched_results, processed_docs, tfidf_matrix, vectorizer
    )
    save_categorized_links(categorized_links)

    # Filter dynamic categories for logging.
    filtered_dynamic = Counter({
        tag: count
        for tag, count in dynamic_categories.items()
        if tag not in unwanted_tags
    })
    filtered_top_new = _top_new_dynamic_tags(dynamic_categories)

    logger.info("Categorization complete. Check the 'categorized_links' folder.")
    logger.info("Dynamic categories (filtered): {}", filtered_dynamic.most_common(10))
    logger.info("Top new dynamic categories (filtered): {}", filtered_top_new)


if __name__ == "__main__":
    import sys
    if len(sys.argv) > 1 and sys.argv[1] == "testpdf":
        source = sys.argv[2] if len(sys.argv) > 2 else "sample.pdf"
        test_pdf_output(source)
    else:
        main()