├── requirements.txt ├── data ├── raw │ ├── allwords_a.pkl │ ├── allwords_b.pkl │ ├── allwords_c.pkl │ ├── allwords_d.pkl │ ├── allwords_e.pkl │ ├── allwords_f.pkl │ ├── allwords_g.pkl │ ├── allwords_h.pkl │ ├── allwords_i.pkl │ ├── allwords_j.pkl │ ├── allwords_k.pkl │ ├── allwords_l.pkl │ ├── allwords_m.pkl │ ├── allwords_n.pkl │ ├── allwords_o.pkl │ ├── allwords_p.pkl │ ├── allwords_q.pkl │ ├── allwords_r.pkl │ ├── allwords_s.pkl │ ├── allwords_t.pkl │ ├── allwords_u.pkl │ ├── allwords_v.pkl │ ├── allwords_w.pkl │ ├── allwords_x.pkl │ ├── allwords_y.pkl │ ├── allwords_z.pkl │ ├── allwords_á.pkl │ ├── allwords_é.pkl │ ├── allwords_í.pkl │ ├── allwords_ñ.pkl │ ├── allwords_ó.pkl │ ├── allwords_ú.pkl │ ├── allwords_ü.pkl │ ├── allwords_termina_a.pkl │ ├── allwords_termina_b.pkl │ ├── allwords_termina_c.pkl │ ├── allwords_termina_d.pkl │ ├── allwords_termina_e.pkl │ ├── allwords_termina_f.pkl │ ├── allwords_termina_g.pkl │ ├── allwords_termina_h.pkl │ ├── allwords_termina_i.pkl │ ├── allwords_termina_j.pkl │ ├── allwords_termina_k.pkl │ ├── allwords_termina_l.pkl │ ├── allwords_termina_m.pkl │ ├── allwords_termina_n.pkl │ ├── allwords_termina_o.pkl │ ├── allwords_termina_p.pkl │ ├── allwords_termina_q.pkl │ ├── allwords_termina_r.pkl │ ├── allwords_termina_s.pkl │ ├── allwords_termina_t.pkl │ ├── allwords_termina_u.pkl │ ├── allwords_termina_v.pkl │ ├── allwords_termina_w.pkl │ ├── allwords_termina_x.pkl │ ├── allwords_termina_y.pkl │ ├── allwords_termina_z.pkl │ ├── allwords_termina_á.pkl │ ├── allwords_termina_é.pkl │ ├── allwords_termina_í.pkl │ ├── allwords_termina_ñ.pkl │ ├── allwords_termina_ó.pkl │ ├── allwords_termina_ú.pkl │ └── allwords_termina_ü.pkl └── archive │ └── 2024-05-22 │ ├── 0_subfijos.txt │ └── 0_prefijos.txt ├── src ├── __pycache__ │ └── helpers.cpython-312.pyc ├── spliter.sh ├── reorder.py ├── starting_letter.sh ├── length.sh ├── post.py ├── post_process.py ├── rae_downloader.py └── helpers.py └── README.md /requirements.txt: -------------------------------------------------------------------------------- 1 | lxml 2 | argparse 3 | -------------------------------------------------------------------------------- /data/raw/allwords_a.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_a.pkl -------------------------------------------------------------------------------- /data/raw/allwords_b.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_b.pkl -------------------------------------------------------------------------------- /data/raw/allwords_c.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_c.pkl -------------------------------------------------------------------------------- /data/raw/allwords_d.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_d.pkl -------------------------------------------------------------------------------- /data/raw/allwords_e.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_e.pkl -------------------------------------------------------------------------------- /data/raw/allwords_f.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_f.pkl -------------------------------------------------------------------------------- /data/raw/allwords_g.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_g.pkl -------------------------------------------------------------------------------- /data/raw/allwords_h.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_h.pkl -------------------------------------------------------------------------------- /data/raw/allwords_i.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_i.pkl -------------------------------------------------------------------------------- /data/raw/allwords_j.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_j.pkl -------------------------------------------------------------------------------- /data/raw/allwords_k.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_k.pkl -------------------------------------------------------------------------------- /data/raw/allwords_l.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_l.pkl -------------------------------------------------------------------------------- /data/raw/allwords_m.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_m.pkl -------------------------------------------------------------------------------- /data/raw/allwords_n.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_n.pkl -------------------------------------------------------------------------------- /data/raw/allwords_o.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_o.pkl -------------------------------------------------------------------------------- /data/raw/allwords_p.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_p.pkl -------------------------------------------------------------------------------- /data/raw/allwords_q.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_q.pkl -------------------------------------------------------------------------------- /data/raw/allwords_r.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_r.pkl -------------------------------------------------------------------------------- /data/raw/allwords_s.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_s.pkl -------------------------------------------------------------------------------- /data/raw/allwords_t.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_t.pkl -------------------------------------------------------------------------------- /data/raw/allwords_u.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_u.pkl -------------------------------------------------------------------------------- /data/raw/allwords_v.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_v.pkl -------------------------------------------------------------------------------- /data/raw/allwords_w.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_w.pkl -------------------------------------------------------------------------------- /data/raw/allwords_x.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_x.pkl -------------------------------------------------------------------------------- /data/raw/allwords_y.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_y.pkl -------------------------------------------------------------------------------- /data/raw/allwords_z.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_z.pkl -------------------------------------------------------------------------------- /data/raw/allwords_á.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_á.pkl -------------------------------------------------------------------------------- /data/raw/allwords_é.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_é.pkl -------------------------------------------------------------------------------- /data/raw/allwords_í.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_í.pkl -------------------------------------------------------------------------------- /data/raw/allwords_ñ.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_ñ.pkl -------------------------------------------------------------------------------- /data/raw/allwords_ó.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_ó.pkl -------------------------------------------------------------------------------- /data/raw/allwords_ú.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_ú.pkl -------------------------------------------------------------------------------- /data/raw/allwords_ü.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_ü.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_a.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_a.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_b.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_b.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_c.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_c.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_d.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_d.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_e.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_e.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_f.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_f.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_g.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_g.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_h.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_h.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_i.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_i.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_j.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_j.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_k.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_k.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_l.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_l.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_m.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_m.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_n.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_n.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_o.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_o.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_p.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_p.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_q.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_q.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_r.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_r.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_s.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_s.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_t.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_t.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_u.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_u.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_v.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_v.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_w.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_w.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_x.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_x.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_y.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_y.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_z.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_z.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_á.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_á.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_é.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_é.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_í.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_í.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_ñ.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_ñ.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_ó.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_ó.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_ú.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_ú.pkl -------------------------------------------------------------------------------- /data/raw/allwords_termina_ü.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_ü.pkl -------------------------------------------------------------------------------- /src/__pycache__/helpers.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/src/__pycache__/helpers.cpython-312.pyc -------------------------------------------------------------------------------- /data/archive/2024-05-22/0_subfijos.txt: -------------------------------------------------------------------------------- 1 | -acanto 2 | -bia 3 | -bio 4 | -cardio 5 | -céfala 6 | -céfalo 7 | -cito 8 | -dáctilo 9 | -dermo 10 | -ferro 11 | -fita 12 | -fito 13 | -lito 14 | -morfa 15 | -morfo 16 | -zoo 17 | -------------------------------------------------------------------------------- /src/spliter.sh: -------------------------------------------------------------------------------- 1 | #ª/bin/bash 2 | 3 | LETRAS=('a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'ñ' 'o' 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z') 4 | 5 | cat palabras_todas.txt | grep -v '.*-$' | grep -v ^- | sort | uniq > 0_palabras_todas.txt 6 | cat palabras_todas.txt | grep '.*-$' | sort | uniq > 0_prefijos.txt 7 | cat palabras_todas.txt | grep ^- | sort | uniq > 0_subfijos.txt 8 | -------------------------------------------------------------------------------- /src/reorder.py: -------------------------------------------------------------------------------- 1 | import time 2 | import argparse 3 | import pickle 4 | import pyuca 5 | import urllib.request 6 | 7 | collator = pyuca.Collator("src/allkeys.txt") 8 | 9 | palabras = [] 10 | 11 | with open("data/0_palabras_todas.txt", 'r') as f: 12 | for line in f: 13 | palabras.append (line.strip()) 14 | 15 | palabras = sorted(list(set(palabras)), key=collator.sort_key) 16 | 17 | with open("data/0_palabras_todas_sorted.txt", 'w') as f: 18 | for palabra in palabras: 19 | f.write(palabra + '\n') 20 | -------------------------------------------------------------------------------- /src/starting_letter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # By: @hasecilu, some lines were extracted from spliter.sh file 4 | 5 | if [ ${PWD##*/} = "diccionario-espanol-txt" ] 6 | then 7 | mkdir -pv starting_letter 8 | LETRAS=('a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'ñ' 'o' 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z') 9 | 10 | for l in "${LETRAS[@]}"; do 11 | cat 0_palabras_todas.txt | grep ^$l > starting_letter/$l.txt 12 | done 13 | 14 | wc -l starting_letter/* 15 | else 16 | echo "Go to the diccionario-espanol-txt folder" 17 | fi 18 | 19 | -------------------------------------------------------------------------------- /src/length.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # By: @hasecilu 4 | 5 | if [ ${PWD##*/} = "diccionario-espanol-txt" ] 6 | then 7 | mkdir -pv length 8 | 9 | FILENAME="0_palabras_todas.txt" 10 | LINES=$(cat $FILENAME) 11 | 12 | COUNTER=0 13 | 14 | for LINE in $LINES 15 | do 16 | if [ ${#LINE} -lt 10 ] 17 | then 18 | echo $LINE >> ./length/0${#LINE}.txt 19 | else 20 | echo $LINE >> ./length/${#LINE}.txt # time: 7.01s user 2.54s system 99% cpu 9.595 total 21 | # echo $LINE | tee -a ./length/${#LINE}.txt # time: 1263.37s user 3400.90s system 131% cpu 58:60.00 total 22 | fi 23 | 24 | COUNTER=$((COUNTER+1)) 25 | done 26 | 27 | wc -l ./length/* 28 | else 29 | echo "Go to the diccioanrio-espanol-folder" 30 | fi 31 | 32 | -------------------------------------------------------------------------------- /src/post.py: -------------------------------------------------------------------------------- 1 | import time 2 | import argparse 3 | import pickle 4 | import pyuca 5 | import urllib.request 6 | 7 | parser = argparse.ArgumentParser(description='RAE Process data.') 8 | parser.add_argument('--inputfile', metavar='outfile no extension', type=str, default="data/allwords") 9 | parser.add_argument('--outputfile', metavar='outputfile', type=str, default="data/allwords.txt") 10 | args = parser.parse_args() 11 | 12 | 13 | letras = ['a', 'á', 'b', 'c', 'd', 'e', 'é', 'f', 'g', 'h', 'i', 'í', 'j', 'k', 'l', 'm', 14 | 'n', 'ñ', 'o', 'ó', 'p', 'q', 'r', 's', 't', 'u', 'ú', 'ü', 'v', 'w', 'x', 'y', 'z'] 15 | 16 | collator = pyuca.Collator() 17 | 18 | 19 | palabras = [] 20 | 21 | for l in letras: 22 | with open(f"{args.inputfile}_{l}.pkl", 'rb') as f: 23 | words = pickle.load(f) 24 | keys = words.keys() 25 | palabras += keys 26 | 27 | 28 | palabras = sorted(list(set(palabras)), key=collator.getSortKey) 29 | 30 | 31 | with open(args.outputfile, 'w') as f: 32 | for palabra in palabras: 33 | f.write(palabra + '\n') 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /src/post_process.py: -------------------------------------------------------------------------------- 1 | import time 2 | import argparse 3 | import pickle 4 | import pyuca 5 | import urllib.request 6 | 7 | parser = argparse.ArgumentParser(description='RAE Process data.') 8 | parser.add_argument('--inputfile', metavar='outfile no extension', type=str, default="data/raw/allwords") 9 | parser.add_argument('--termina', default="", type=str) 10 | parser.add_argument('--outputfile', metavar='outputfile', type=str, default="data/allwords") 11 | args = parser.parse_args() 12 | 13 | def save_file(lista, file): 14 | with open(file, 'w') as f: 15 | for item in lista: 16 | f.write(item + '\n') 17 | 18 | 19 | letras = ['a', 'á', 'b', 'c', 'd', 'e', 'é', 'f', 'g', 'h', 'i', 'í', 'j', 'k', 'l', 'm', 20 | 'n', 'ñ', 'o', 'ó', 'p', 'q', 'r', 's', 't', 'u', 'ú', 'ü', 'v', 'w', 'x', 'y', 'z'] 21 | 22 | 23 | if args.termina != "": 24 | termina = "data/raw/allwords_termina" 25 | else: 26 | termina = None 27 | 28 | collator = pyuca.Collator("src/allkeys.txt") 29 | 30 | palabras = [] 31 | 32 | for l in letras: 33 | with open(f"{args.inputfile}_{l}.pkl", 'rb') as f: 34 | words = pickle.load(f) 35 | keys = words.keys() 36 | palabras += keys 37 | if termina: 38 | with open(f"{termina}_{l}.pkl", 'rb') as f: 39 | words = pickle.load(f) 40 | keys = words.keys() 41 | palabras += keys 42 | 43 | 44 | 45 | palabras = sorted(list(set(palabras)), key=collator.sort_key) 46 | 47 | #paraborrar = ["(impersonal:", "(solo", ")"] 48 | #for borrar in paraborrar: 49 | # palabras.remove(borrar) 50 | 51 | 52 | save_file(palabras, f"{args.outputfile}.txt") 53 | 54 | "-", "" 55 | 56 | """ 57 | 58 | 59 | 60 | 61 | longitudes = {} 62 | 63 | for palabra in palabras: 64 | longitud = len(palabra) 65 | if longitud in longitudes: 66 | longitudes[longitud] += palabra 67 | else: 68 | longitudes[longitud] = [palabra] 69 | 70 | for longitud in longitudes: 71 | longitudes[longitud] = sorted(longitudes[longitud], key=collator.sort_key) 72 | 73 | """ 74 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # List of all spanish words 2 | 3 | This project has gained a lot of attention from researchers and application developers. I think that this information should be provided by the RAE organization. Meanwhile you can find the information here. 4 | 5 | > Updated with RAE server in: 2025-02-10 6 | 7 | ## Data layout 8 | 9 | ```txt 10 | \- src: python source code 11 | \- data 12 | \- analysis (WorkInProgress) 13 | \- clean (WorkInProgress) 14 | \- meanings (WorkInProgress) 15 | \- raw 16 | \- archive 17 | ``` 18 | 19 | ## Running 20 | 21 | Steps: 22 | 23 | 1. install requeriments 24 | 2. run web scrapper (src/rae_downloader.py) saved as pickle files 25 | 3. run post process ( convert to txt, sort, cleaning, etc.) 26 | 27 | 28 | 29 | 30 | 31 | ## Outdated information. 32 | 33 | Usage 34 | ``` 35 | usage: rae_downloader.py [-h] [--conjugaciones] [--skip-conjugaciones] 36 | [--outfile outfile] 37 | [--outfile outfile] 38 | 39 | RAE Downloader. 40 | 41 | optional arguments: 42 | -h, --help show this help message and exit 43 | --conjugaciones 44 | --skip-conjugaciones 45 | --outfile outfile 46 | ``` 47 | 48 | Words in file has no order and can be duplicades: 49 | 50 | ``` 51 | cat palabras_todas.txt | grep -v '.*-$' | grep -v ^- | sort | uniq > 0_palabras_todas.txt 52 | --outfile outfile 53 | ``` 54 | 55 | Words in file has no order and can be duplicades: 56 | 57 | ``` 58 | cat palabras_todas.txt | grep -v '.*-$' | grep -v ^- | sort | uniq > 0_palabras_todas.txt 59 | ``` 60 | 61 | ### Classify words by their length 62 | 63 | The `0_palabras_todas.txt` file is needed. 64 | 65 | Inside the `diccionario-espanol-txt` folder and running the `length.sh` file will create the `length` folder with the words classified by its length. 66 | 67 | ``` 68 | bash src/length.sh 69 | ``` 70 | 71 | ### Classify words by their first letter 72 | 73 | The `0_palabras_todas.txt` file is needed. 74 | 75 | 76 | Due to the lack of `palabras_todas.txt` file (creating it will last so many hours) the `spliter.sh` file will not work. So this script works with the `0_palabras_todas.txt` file. 77 | 78 | Inside the `diccionario-espanol-txt` folder and running the `starting_letter.sh` file will create the `starting_letter` folder with the words classified by the first letter. 79 | 80 | ``` 81 | bash src/starting_letter.sh 82 | ``` 83 | 84 | 85 | ## Conjugaciones 86 | 87 | 88 | ## Remember 89 | 90 | Doble check after download: 91 | 92 | - There is words starting by á, é, etc. 93 | - Check plurals: gato, gata, gatos, gatas. 94 | 95 | ## Changelog 96 | 97 | 2024-10-20: 98 | - Some variable names typos corrected 99 | - Try to get plurals 100 | - Verifica ababílla 101 | -------------------------------------------------------------------------------- /data/archive/2024-05-22/0_prefijos.txt: -------------------------------------------------------------------------------- 1 | a- 2 | acanto- 3 | acro- 4 | acui- 5 | ad- 6 | adeno- 7 | aero- 8 | afro- 9 | agro- 10 | alo- 11 | alti- 12 | ana- 13 | anarco- 14 | andro- 15 | anemo- 16 | anfi- 17 | angio- 18 | anglo- 19 | aniso- 20 | ante- 21 | anti- 22 | antropo- 23 | apico- 24 | arbori- 25 | archi- 26 | arqueo- 27 | arqui- 28 | astro- 29 | atto- 30 | audio- 31 | austro- 32 | auto- 33 | baro- 34 | bi- 35 | biblio- 36 | bio- 37 | bis- 38 | biz- 39 | bradi- 40 | cardio- 41 | cata- 42 | cefalo- 43 | centi- 44 | cian- 45 | ciano- 46 | ciber- 47 | circun- 48 | cis- 49 | cito- 50 | clepto- 51 | co- 52 | con- 53 | contra- 54 | cosmo- 55 | crio- 56 | cripto- 57 | cromo- 58 | crono- 59 | cuadr- 60 | cuadri- 61 | cuadru- 62 | cuasi- 63 | cuatri- 64 | dactilo- 65 | de- 66 | deca- 67 | deci- 68 | demo- 69 | denti- 70 | dento- 71 | derm- 72 | dermat- 73 | dermato- 74 | dermo- 75 | des- 76 | di- 77 | dia- 78 | dis- 79 | dodeca- 80 | e- 81 | eco- 82 | ecto- 83 | electro- 84 | en- 85 | endeca- 86 | endo- 87 | enea- 88 | eno- 89 | entero- 90 | entre- 91 | equi- 92 | eritro- 93 | es- 94 | esclero- 95 | estereo- 96 | etno- 97 | euro- 98 | ex- 99 | exa- 100 | exo- 101 | extra- 102 | fago- 103 | femto- 104 | ferro- 105 | filo- 106 | fisio- 107 | fito- 108 | fono- 109 | foto- 110 | franco- 111 | galacto- 112 | galo- 113 | gamo- 114 | gastero- 115 | gastro- 116 | geo- 117 | germano- 118 | geronto- 119 | giga- 120 | gineco- 121 | gluco- 122 | grafo- 123 | halo- 124 | hecto- 125 | helico- 126 | helio- 127 | hema- 128 | hemato- 129 | hemi- 130 | hemo- 131 | hepato- 132 | hepta- 133 | hetero- 134 | hexa- 135 | hidro- 136 | higro- 137 | hiper- 138 | hipo- 139 | hispano- 140 | histo- 141 | holo- 142 | homeo- 143 | homo- 144 | ibero- 145 | in- 146 | indo- 147 | infra- 148 | inmuno- 149 | inter- 150 | intra- 151 | islamo- 152 | iso- 153 | italo- 154 | kili- 155 | kilo- 156 | leuco- 157 | linfo- 158 | lipo- 159 | lito- 160 | macro- 161 | magneto- 162 | masto- 163 | maxi- 164 | mega- 165 | meso- 166 | meta- 167 | micro- 168 | mili- 169 | mini- 170 | mio- 171 | miria- 172 | mono- 173 | morfo- 174 | moto- 175 | muco- 176 | multi- 177 | nano- 178 | narco- 179 | necro- 180 | nefro- 181 | neo- 182 | neumo- 183 | neuro- 184 | nitro- 185 | nor- 186 | nord- 187 | octa- 188 | octo- 189 | oligo- 190 | onco- 191 | onto- 192 | ornito- 193 | orto- 194 | osteo- 195 | paleo- 196 | pan- 197 | para- 198 | pato- 199 | penta- 200 | per- 201 | peri- 202 | peta- 203 | pico- 204 | piro- 205 | pluri- 206 | podo- 207 | poli- 208 | pos- 209 | pre- 210 | pro- 211 | proto- 212 | psico- 213 | ptero- 214 | quimio- 215 | quiro- 216 | radio- 217 | re- 218 | requete- 219 | res- 220 | rete- 221 | retro- 222 | rino- 223 | rizo- 224 | sarco- 225 | seleno- 226 | semi- 227 | sero- 228 | servo- 229 | sesqui- 230 | seudo- 231 | sin- 232 | so- 233 | sobre- 234 | socio- 235 | son- 236 | sota- 237 | soto- 238 | sub- 239 | sud- 240 | super- 241 | supra- 242 | sur- 243 | tanato- 244 | taqui- 245 | tardo- 246 | tecno- 247 | tele- 248 | tera- 249 | termo- 250 | tetra- 251 | trans- 252 | tri- 253 | tribo- 254 | turbo- 255 | ultra- 256 | uni- 257 | vi- 258 | vice- 259 | video- 260 | viz- 261 | xeno- 262 | xero- 263 | xilo- 264 | zoo- 265 | -------------------------------------------------------------------------------- /src/rae_downloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Desarrollado por Jorge Dueñas Lerín 4 | 5 | from urllib.parse import quote 6 | from urllib.request import Request, urlopen 7 | from lxml import etree 8 | 9 | import time 10 | import argparse 11 | import pickle 12 | 13 | from helpers import get_xtree, try_conjugacion, try_plural, try_me_siento_con_suerte, url_list_empieza, url_list_termina, skip 14 | 15 | 16 | parser = argparse.ArgumentParser(description='RAE Downloader.') 17 | parser.add_argument('--ix', metavar='ix', type=int, required=True, help='Start with this letter index') 18 | parser.add_argument('--termina', dest='termina', action='store_true') 19 | parser.add_argument('--conjugaciones', action='store_true') 20 | parser.add_argument('--skip-conjugaciones', dest='conjugaciones', action='store_false') 21 | parser.set_defaults(conjugaciones=True) 22 | parser.add_argument('--plurals', default=True) 23 | parser.add_argument('--outfile', metavar='outfile no extension', type=str, default="data/raw/allwords") 24 | args = parser.parse_args() 25 | 26 | 27 | letras = ['a', 'á', 'b', 'c', 'd', 'e', 'é', 'f', 'g', 'h', 'i', 'í', 'j', 'k', 'l', 'm', 28 | 'n', 'ñ', 'o', 'ó', 'p', 'q', 'r', 's', 't', 'u', 'ú', 'ü', 'v', 'w', 'x', 'y', 'z'] 29 | #letras = ['s', 'i', 'í'] 30 | letras_count = len(letras) 31 | start = letras[args.ix] 32 | print(f"Running with {args.ix}/{letras_count}: {start}") 33 | start_with = [start] 34 | dict_dump = {} 35 | 36 | if args.termina: 37 | url_list = url_list_termina 38 | else: 39 | url_list = url_list_empieza 40 | 41 | NITEMS=20 42 | 43 | def procesa(palabras): 44 | # Se repiten palabras. Cuando por ejemplo aba tiene más de 30 y se exapande 45 | # abaa, abab, etc... las primeras palabras no aparecen: aba 46 | numpal = len(palabras) 47 | for ix, pal in enumerate(palabras): 48 | if pal.startswith(","): 49 | print("Tratada antes", pal) 50 | continue 51 | 52 | if ix+1 < numpal and palabras[ix+1].startswith(","): 53 | pal = pal + palabras[ix+1] 54 | 55 | print(pal) 56 | dict_dump[pal] = pal 57 | 58 | """ 59 | This code is comented. It is not update with the last version of the RAE website. 60 | TODO. 61 | 62 | if ", " not in pal_clean: 63 | pal_list.append(pal_clean) 64 | else: 65 | pal_clean = pal_clean.split(", ") 66 | for pal_clean_multi in pal_clean: 67 | pal_list.append(pal_clean_multi) 68 | """ 69 | """ 70 | for pal_ix in pal_list: 71 | 72 | #if args.conjugaciones: 73 | # try_conjugacion(pal_ix, dict_dump) 74 | # try_plural(pal_ix, dict_dump) 75 | """ 76 | 77 | 78 | while len(start_with) != 0: 79 | palabra_start_with = start_with.pop(0) 80 | 81 | if(palabra_start_with in ['app', 'docs', 'js']): # RAE servers do not like this 82 | continue 83 | 84 | try_me_siento_con_suerte(palabra_start_with, dict_dump) 85 | 86 | tree = get_xtree(url_list, palabra_start_with) 87 | pags = tree.xpath('//*/*[@class="c-pagination"]/*/text()') 88 | 89 | res = tree.xpath('//*/article/h3/a/text()') 90 | procesa(res) 91 | 92 | if pags: 93 | npags = max([int(x,0) for x in pags if x.isdigit()]) 94 | print("Hay páginas") 95 | for page in range(npags): 96 | if page == 0: 97 | continue 98 | print("Página: " + str(page)) 99 | fparam = page*NITEMS 100 | 101 | tree = get_xtree(url_list, palabra_start_with, fparam) 102 | res = tree.xpath('//*/article/h3/a/text()') 103 | res = res + tree.xpath('//*/article/h3/a/i/text()') 104 | procesa(res) 105 | 106 | else: 107 | print("No hay páginas") 108 | 109 | if pags: 110 | print("!" * 80) 111 | print("EXAPEND: " + palabra_start_with) 112 | expand = [palabra_start_with + l for l in letras] 113 | start_with = expand + start_with 114 | 115 | 116 | pickle.dump(dict_dump, open(f"{args.outfile}_{start}.pkl", "wb")) 117 | -------------------------------------------------------------------------------- /src/helpers.py: -------------------------------------------------------------------------------- 1 | import time 2 | import argparse 3 | import pickle 4 | 5 | from lxml import etree 6 | from urllib.parse import quote 7 | from urllib.request import Request, urlopen 8 | 9 | """ 10 | Cabeceras para la simulación de un navegador 11 | """ 12 | UA="Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0" 13 | url_list_empieza="https://dle.rae.es/{}/?m=31&f={}" 14 | url_list_termina="https://dle.rae.es/{}/?m=32&f={}" 15 | url_detail="https://dle.rae.es/{}" 16 | 17 | """ 18 | Usamos title por que el contenido en determinadas situaciones cambia: 19 | https://dle.rae.es/abollado?m=31 20 | 21 | abollado1, da 22 | abollado2 23 | 24 | """ 25 | to_remove_from_title='Ir a la entrada ' 26 | 27 | skip = len(to_remove_from_title) 28 | 29 | 30 | def get_xtree(url, param, offset=0): 31 | tree = None 32 | attempt = 10 33 | while attempt > 0 and tree == None: 34 | try: 35 | req = Request(url.format(quote(param), offset), headers={'User-Agent': UA}) 36 | webpage = urlopen(req, timeout=2) # Set the timeout value to 10 seconds 37 | # imprimit contenido 38 | #print(webpage.read()) 39 | htmlparser = etree.HTMLParser() 40 | tree = etree.parse(webpage, htmlparser) 41 | except Exception as e: 42 | attempt -= 1 43 | print(str(e)) 44 | time.sleep(10) 45 | 46 | return tree 47 | 48 | 49 | def try_conjugacion(palabra, dict_dump): 50 | print("Intentamos conjugar " + palabra) 51 | tree = get_xtree(url_detail, palabra) 52 | contains_conjugacion = tree.xpath('//*[@id="resultados"]/*/a[@class="e2"]/@title') 53 | if len(contains_conjugacion) > 0: 54 | print("^" * 80) 55 | print(contains_conjugacion) 56 | # get all contant in tds 57 | conjugacion = tree.xpath('//div[@id="conjugacion"]//td//text()') 58 | conjugacion_clean = ' '.join(conjugacion).replace(', ', ' ').replace(' / ', ' ').split(' ') 59 | for conj in conjugacion_clean: 60 | if(conj!=''): 61 | print(conj) 62 | dict_dump[conj] = conj 63 | 64 | 65 | def try_me_siento_con_suerte(palabra, dict_dump): 66 | # RAE por ejemplo al buscar si, devuelve psicolo, psiblabla, etc... 67 | # esta función prueba la cadena de caracteres en la url, la mayoría dará no pero alguna dará sí. Por ejemplo sí. 68 | # Ahora mismo sí, sí que aparece por la inclusión de las tildes en el lista inicial. 69 | # pero puede haber situaciones de palabras que no estén en la lista de resultado de búsqueda y que sean palabras. 70 | print("Intentamos suerte " + palabra) 71 | tree = get_xtree(url_detail, palabra) 72 | posible_palabra = tree.xpath('//*/h1[@class="c-page-header__title"]/text()') 73 | print(posible_palabra) 74 | if len(posible_palabra) > 0: 75 | print("Aceptamos:" + palabra) 76 | dict_dump[palabra] = palabra 77 | else: 78 | print("Denegamos:" + palabra) 79 | 80 | 81 | """ 82 | Revisar bien con las reglas de https://www.rae.es/dpd/plural 83 | """ 84 | def formar_plural(palabra): 85 | plurales = [] 86 | 87 | # Si la palabra termina en vocal átona o en -e tónica 88 | if palabra[-1] in ['a', 'e', 'i', 'o', 'u']: 89 | plurales.append(palabra + 's') 90 | 91 | # Si la palabra termina en -a o -o tónicas 92 | elif palabra[-1] in ['á', 'ó']: 93 | if palabra not in ['faralá', 'albalá', 'no']: 94 | plurales.append(palabra + 's') 95 | else: 96 | plurales.append(palabra + 'es') 97 | 98 | # Si la palabra termina en -i o -u tónicas 99 | elif palabra[-1] in ['í', 'ú']: 100 | plurales.append(palabra + 's') 101 | plurales.append(palabra + 'es') 102 | 103 | # Si la palabra termina en -y precedida de vocal 104 | elif palabra[-1] == 'y' and len(palabra)>1 and palabra[-2] in ['a', 'e', 'i', 'o', 'u']: 105 | plurales.append(palabra[:-1] + 'es') 106 | if palabra in ['gay', 'jersey', 'espray', 'yóquey']: 107 | plurales.append(palabra[:-1] + 's') 108 | 109 | # Si la palabra termina en -s o -x 110 | elif palabra[-1] in ['s', 'x']: 111 | if palabra[-2:] in ['ás', 'és', 'ís', 'ós', 'ús'] or palabra[-1] == 'x': 112 | plurales.append(palabra + 'es') 113 | else: 114 | plurales.append(palabra) # invariable 115 | 116 | # Si la palabra termina en -l, -r, -n, -d, -z, -j 117 | elif palabra[-1] in ['l', 'r', 'n', 'd', 'z', 'j']: 118 | plurales.append(palabra + 'es') 119 | 120 | # Si la palabra termina en consonantes distintas de las anteriores 121 | elif palabra[-1] not in ['l', 'r', 'n', 'd', 'z', 'j', 's', 'x']: 122 | plurales.append(palabra + 's') 123 | 124 | return plurales 125 | 126 | # Ejemplo de uso: 127 | # palabra = "sofá" 128 | # print(f"Formas posibles del plural de '{palabra}': {formar_plural(palabra)}") 129 | 130 | 131 | def try_plural(palabra, dict_dump): 132 | print("Intentamos plural " + palabra) 133 | plural = formar_plural(palabra) 134 | for pl in plural: 135 | tree = get_xtree(url_detail, pl) 136 | posible_plural = tree.xpath('//*[@id="resultados"]/div[@class="otras"]/p/text()') 137 | if len(posible_plural) > 0 and pl in posible_plural[0]: 138 | print("Aceptamos:" + pl) 139 | dict_dump[pl] = pl 140 | else: 141 | # Puede ser una palabra: a -> plural as, es una palabra. 142 | # Aquí la denegamos. La recogeremos como palabra en otra parte del script 143 | print("Denegamos:" + pl) --------------------------------------------------------------------------------