├── requirements.txt
├── data
├── raw
│ ├── allwords_a.pkl
│ ├── allwords_b.pkl
│ ├── allwords_c.pkl
│ ├── allwords_d.pkl
│ ├── allwords_e.pkl
│ ├── allwords_f.pkl
│ ├── allwords_g.pkl
│ ├── allwords_h.pkl
│ ├── allwords_i.pkl
│ ├── allwords_j.pkl
│ ├── allwords_k.pkl
│ ├── allwords_l.pkl
│ ├── allwords_m.pkl
│ ├── allwords_n.pkl
│ ├── allwords_o.pkl
│ ├── allwords_p.pkl
│ ├── allwords_q.pkl
│ ├── allwords_r.pkl
│ ├── allwords_s.pkl
│ ├── allwords_t.pkl
│ ├── allwords_u.pkl
│ ├── allwords_v.pkl
│ ├── allwords_w.pkl
│ ├── allwords_x.pkl
│ ├── allwords_y.pkl
│ ├── allwords_z.pkl
│ ├── allwords_á.pkl
│ ├── allwords_é.pkl
│ ├── allwords_í.pkl
│ ├── allwords_ñ.pkl
│ ├── allwords_ó.pkl
│ ├── allwords_ú.pkl
│ ├── allwords_ü.pkl
│ ├── allwords_termina_a.pkl
│ ├── allwords_termina_b.pkl
│ ├── allwords_termina_c.pkl
│ ├── allwords_termina_d.pkl
│ ├── allwords_termina_e.pkl
│ ├── allwords_termina_f.pkl
│ ├── allwords_termina_g.pkl
│ ├── allwords_termina_h.pkl
│ ├── allwords_termina_i.pkl
│ ├── allwords_termina_j.pkl
│ ├── allwords_termina_k.pkl
│ ├── allwords_termina_l.pkl
│ ├── allwords_termina_m.pkl
│ ├── allwords_termina_n.pkl
│ ├── allwords_termina_o.pkl
│ ├── allwords_termina_p.pkl
│ ├── allwords_termina_q.pkl
│ ├── allwords_termina_r.pkl
│ ├── allwords_termina_s.pkl
│ ├── allwords_termina_t.pkl
│ ├── allwords_termina_u.pkl
│ ├── allwords_termina_v.pkl
│ ├── allwords_termina_w.pkl
│ ├── allwords_termina_x.pkl
│ ├── allwords_termina_y.pkl
│ ├── allwords_termina_z.pkl
│ ├── allwords_termina_á.pkl
│ ├── allwords_termina_é.pkl
│ ├── allwords_termina_í.pkl
│ ├── allwords_termina_ñ.pkl
│ ├── allwords_termina_ó.pkl
│ ├── allwords_termina_ú.pkl
│ └── allwords_termina_ü.pkl
└── archive
│ └── 2024-05-22
│ ├── 0_subfijos.txt
│ └── 0_prefijos.txt
├── src
├── __pycache__
│ └── helpers.cpython-312.pyc
├── spliter.sh
├── reorder.py
├── starting_letter.sh
├── length.sh
├── post.py
├── post_process.py
├── rae_downloader.py
└── helpers.py
└── README.md
/requirements.txt:
--------------------------------------------------------------------------------
1 | lxml
2 | argparse
3 |
--------------------------------------------------------------------------------
/data/raw/allwords_a.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_a.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_b.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_b.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_c.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_c.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_d.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_d.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_e.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_e.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_f.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_f.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_g.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_g.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_h.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_h.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_i.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_i.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_j.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_j.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_k.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_k.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_l.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_l.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_m.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_m.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_n.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_n.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_o.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_o.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_p.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_p.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_q.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_q.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_r.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_r.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_s.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_s.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_t.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_t.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_u.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_u.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_v.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_v.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_w.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_w.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_x.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_x.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_y.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_y.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_z.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_z.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_á.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_á.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_é.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_é.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_í.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_í.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_ñ.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_ñ.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_ó.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_ó.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_ú.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_ú.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_ü.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_ü.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_a.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_a.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_b.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_b.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_c.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_c.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_d.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_d.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_e.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_e.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_f.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_f.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_g.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_g.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_h.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_h.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_i.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_i.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_j.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_j.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_k.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_k.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_l.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_l.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_m.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_m.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_n.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_n.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_o.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_o.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_p.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_p.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_q.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_q.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_r.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_r.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_s.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_s.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_t.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_t.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_u.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_u.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_v.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_v.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_w.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_w.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_x.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_x.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_y.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_y.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_z.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_z.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_á.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_á.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_é.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_é.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_í.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_í.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_ñ.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_ñ.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_ó.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_ó.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_ú.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_ú.pkl
--------------------------------------------------------------------------------
/data/raw/allwords_termina_ü.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/data/raw/allwords_termina_ü.pkl
--------------------------------------------------------------------------------
/src/__pycache__/helpers.cpython-312.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JorgeDuenasLerin/diccionario-espanol-txt/HEAD/src/__pycache__/helpers.cpython-312.pyc
--------------------------------------------------------------------------------
/data/archive/2024-05-22/0_subfijos.txt:
--------------------------------------------------------------------------------
1 | -acanto
2 | -bia
3 | -bio
4 | -cardio
5 | -céfala
6 | -céfalo
7 | -cito
8 | -dáctilo
9 | -dermo
10 | -ferro
11 | -fita
12 | -fito
13 | -lito
14 | -morfa
15 | -morfo
16 | -zoo
17 |
--------------------------------------------------------------------------------
/src/spliter.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Split the raw word dump (palabras_todas.txt, read from the current
# directory) into three sorted, de-duplicated lists:
#   0_palabras_todas.txt  entries that neither start nor end with "-"
#   0_prefijos.txt        entries ending with "-"
#   0_subfijos.txt        entries starting with "-"

# Alphabet kept from the original script; nothing below references it.
LETRAS=('a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'ñ' 'o' 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z')

# grep reads the file directly (no extra `cat` process) and `sort -u`
# replaces `sort | uniq`; the patterns themselves are unchanged.
grep -v '.*-$' palabras_todas.txt | grep -v '^-' | sort -u > 0_palabras_todas.txt
grep '.*-$' palabras_todas.txt | sort -u > 0_prefijos.txt
grep '^-' palabras_todas.txt | sort -u > 0_subfijos.txt

--------------------------------------------------------------------------------
/src/reorder.py:
--------------------------------------------------------------------------------
"""Re-sort the word list using the pyuca collation order.

Reads ``data/0_palabras_todas.txt``, drops duplicate entries and writes the
result to ``data/0_palabras_todas_sorted.txt``, one entry per line, ordered
by the sort key of the collator loaded from ``src/allkeys.txt``. All paths
are relative to the current working directory.
"""
import time
import argparse
import pickle
import pyuca
import urllib.request

collator = pyuca.Collator("src/allkeys.txt")

# The word list contains accented characters, so the encoding is pinned to
# UTF-8 instead of being left to the platform default.
with open("data/0_palabras_todas.txt", 'r', encoding="utf-8") as f:
    # A set removes duplicates while reading; surrounding whitespace
    # (including the newline) is stripped from every line.
    palabras = {line.strip() for line in f}

palabras = sorted(palabras, key=collator.sort_key)

with open("data/0_palabras_todas_sorted.txt", 'w', encoding="utf-8") as f:
    f.writelines(palabra + '\n' for palabra in palabras)

--------------------------------------------------------------------------------
/src/starting_letter.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# By: @hasecilu, some lines were extracted from spliter.sh file
#
# Write the entries of 0_palabras_todas.txt that begin with each letter of
# LETRAS to starting_letter/<letter>.txt, then print the line counts.
# Only runs when the current directory is named diccionario-espanol-txt.

# Expansions are quoted so that a directory name containing spaces or glob
# characters cannot break the test or the redirections.
if [ "${PWD##*/}" = "diccionario-espanol-txt" ]
then
    mkdir -pv starting_letter
    LETRAS=('a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'ñ' 'o' 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z')

    for l in "${LETRAS[@]}"; do
        # grep reads the file directly; each run overwrites the letter's file.
        grep "^$l" 0_palabras_todas.txt > "starting_letter/$l.txt"
    done

    wc -l starting_letter/*
else
    echo "Go to the diccionario-espanol-txt folder"
fi

--------------------------------------------------------------------------------
/src/length.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# By: @hasecilu
#
# Classify the entries of 0_palabras_todas.txt by length: every entry is
# appended to length/NN.txt, where NN is the value of ${#LINE} zero-padded
# to two digits. Only runs when the current directory is named
# diccionario-espanol-txt.
#
# NOTE: output files are opened in append mode, so entries accumulate if the
# script is run again without removing the length folder first.

if [ "${PWD##*/}" = "diccionario-espanol-txt" ]
then
    mkdir -pv length

    FILENAME="0_palabras_todas.txt"

    # Read line by line instead of word-splitting $(cat ...): entries are no
    # longer subject to glob expansion and an entry containing a space stays
    # in one piece. `read` still trims leading/trailing whitespace, and the
    # `|| [ -n ... ]` keeps a final line that lacks a trailing newline.
    while read -r LINE || [ -n "$LINE" ]
    do
        # Blank lines carry no word; skip them.
        [ -z "$LINE" ] && continue

        # Zero-pad so single-digit lengths get names like 09.txt.
        printf -v LENGTH '%02d' "${#LINE}"

        # Timing with plain >> redirection:
        #   7.01s user 2.54s system 99% cpu 9.595 total
        # Timing with `tee -a` instead:
        #   1263.37s user 3400.90s system 131% cpu 58:60.00 total
        printf '%s\n' "$LINE" >> "./length/$LENGTH.txt"
    done < "$FILENAME"

    wc -l ./length/*
else
    echo "Go to the diccionario-espanol-txt folder"
fi

--------------------------------------------------------------------------------
/src/post.py:
--------------------------------------------------------------------------------
"""Merge the per-letter pickle files into one sorted plain-text word list.

For every letter in ``letras`` the file ``<inputfile>_<letter>.pkl`` is
loaded and its keys are collected. The de-duplicated keys are then sorted
with the pyuca collator and written to ``--outputfile``, one per line.
"""
import time
import argparse
import pickle
import pyuca
import urllib.request

parser = argparse.ArgumentParser(description='RAE Process data.')
parser.add_argument('--inputfile', metavar='outfile no extension', type=str, default="data/allwords")
parser.add_argument('--outputfile', metavar='outputfile', type=str, default="data/allwords.txt")
args = parser.parse_args()


letras = ['a', 'á', 'b', 'c', 'd', 'e', 'é', 'f', 'g', 'h', 'i', 'í', 'j', 'k', 'l', 'm',
          'n', 'ñ', 'o', 'ó', 'p', 'q', 'r', 's', 't', 'u', 'ú', 'ü', 'v', 'w', 'x', 'y', 'z']

collator = pyuca.Collator()

# Collected in a set so that keys present in several files appear only once.
palabras = set()

for l in letras:
    # NOTE: pickle.load can execute arbitrary code; only use it on pickle
    # files produced by a trusted source.
    with open(f"{args.inputfile}_{l}.pkl", 'rb') as f:
        words = pickle.load(f)
    # Each pickle is expected to hold a mapping; only its keys are used.
    palabras.update(words.keys())

# pyuca exposes the collation key as Collator.sort_key, the same method the
# sibling scripts use; the Collator has no getSortKey attribute.
palabras = sorted(palabras, key=collator.sort_key)

# The words contain accented characters, so the output encoding is pinned to
# UTF-8 instead of being left to the platform default.
with open(args.outputfile, 'w', encoding="utf-8") as f:
    f.writelines(palabra + '\n' for palabra in palabras)

--------------------------------------------------------------------------------
/src/post_process.py:
--------------------------------------------------------------------------------
"""Convert the per-letter pickle files into one sorted plain-text word list.

For every letter in ``letras`` the file ``<inputfile>_<letter>.pkl`` is
loaded and its keys are collected. When ``--termina`` is given a non-empty
value, the matching ``data/raw/allwords_termina_<letter>.pkl`` file is loaded
as well. The de-duplicated keys are sorted with the pyuca collator loaded
from ``src/allkeys.txt`` and written to ``<outputfile>.txt``, one per line.
"""
import time
import argparse
import pickle
import pyuca
import urllib.request

parser = argparse.ArgumentParser(description='RAE Process data.')
parser.add_argument('--inputfile', metavar='outfile no extension', type=str, default="data/raw/allwords")
parser.add_argument('--termina', default="", type=str)
parser.add_argument('--outputfile', metavar='outputfile', type=str, default="data/allwords")
args = parser.parse_args()


def save_file(lista, file):
    """Write every item of *lista* to the path *file*, one item per line.

    The file is written as UTF-8 because the items contain accented
    characters; an existing file is overwritten.
    """
    with open(file, 'w', encoding="utf-8") as f:
        f.writelines(item + '\n' for item in lista)


def _load_keys(path):
    """Return the keys of the object pickled at *path*.

    NOTE: pickle.load can execute arbitrary code; only use it on pickle
    files produced by a trusted source.
    """
    with open(path, 'rb') as f:
        return pickle.load(f).keys()


letras = ['a', 'á', 'b', 'c', 'd', 'e', 'é', 'f', 'g', 'h', 'i', 'í', 'j', 'k', 'l', 'm',
          'n', 'ñ', 'o', 'ó', 'p', 'q', 'r', 's', 't', 'u', 'ú', 'ü', 'v', 'w', 'x', 'y', 'z']


# Any non-empty --termina value switches the extra files on; the value itself
# is not used as a path.
termina = "data/raw/allwords_termina" if args.termina else None

collator = pyuca.Collator("src/allkeys.txt")

# Collected in a set so that keys present in several files appear only once.
palabras = set()

for l in letras:
    palabras.update(_load_keys(f"{args.inputfile}_{l}.pkl"))
    if termina:
        palabras.update(_load_keys(f"{termina}_{l}.pkl"))

palabras = sorted(palabras, key=collator.sort_key)

#paraborrar = ["(impersonal:", "(solo", ")"]
#for borrar in paraborrar:
#    palabras.remove(borrar)


save_file(palabras, f"{args.outputfile}.txt")

# TODO: group the words by length. Work-in-progress sketch, not executed.
# Each word must be appended to its list: `+=` with a string would add the
# word's individual characters instead.
#
# longitudes = {}
#
# for palabra in palabras:
#     longitud = len(palabra)
#     if longitud in longitudes:
#         longitudes[longitud].append(palabra)
#     else:
#         longitudes[longitud] = [palabra]
#
# for longitud in longitudes:
#     longitudes[longitud] = sorted(longitudes[longitud], key=collator.sort_key)

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # List of all Spanish words
2 |
3 | This project has gained a lot of attention from researchers and application developers. I think that this information should be provided by the RAE organization. Meanwhile you can find the information here.
4 |
5 | > Updated with RAE server in: 2025-02-10
6 |
7 | ## Data layout
8 |
9 | ```txt
10 | \- src: python source code
11 | \- data
12 | \- analysis (WorkInProgress)
13 | \- clean (WorkInProgress)
14 | \- meanings (WorkInProgress)
15 | \- raw
16 | \- archive
17 | ```
18 |
19 | ## Running
20 |
21 | Steps:
22 |
23 | 1. Install the requirements
24 | 2. Run the web scraper (src/rae_downloader.py); the results are saved as pickle files
25 | 3. Run the post-processing (convert to txt, sort, clean, etc.)
26 |
27 |
28 |
29 |
30 |
31 | ## Outdated information.
32 |
33 | Usage
34 | ```
35 | usage: rae_downloader.py [-h] [--conjugaciones] [--skip-conjugaciones]
36 | [--outfile outfile]
37 | [--outfile outfile]
38 |
39 | RAE Downloader.
40 |
41 | optional arguments:
42 | -h, --help show this help message and exit
43 | --conjugaciones
44 | --skip-conjugaciones
45 | --outfile outfile
46 | ```
47 |
48 | Words in the file are unordered and may be duplicated:
49 |
50 | ```
51 | cat palabras_todas.txt | grep -v '.*-$' | grep -v ^- | sort | uniq > 0_palabras_todas.txt
52 | --outfile outfile
53 | ```
54 |
55 | Words in the file are unordered and may be duplicated:
56 |
57 | ```
58 | cat palabras_todas.txt | grep -v '.*-$' | grep -v ^- | sort | uniq > 0_palabras_todas.txt
59 | ```
60 |
61 | ### Classify words by their length
62 |
63 | The `0_palabras_todas.txt` file is needed.
64 |
65 | From inside the `diccionario-espanol-txt` folder, running the `length.sh` file creates the `length` folder with the words classified by their length.
66 |
67 | ```
68 | bash src/length.sh
69 | ```
70 |
71 | ### Classify words by their first letter
72 |
73 | The `0_palabras_todas.txt` file is needed.
74 |
75 |
76 | Because the `palabras_todas.txt` file is not available (creating it takes many hours), the `spliter.sh` file will not work, so this script works with the `0_palabras_todas.txt` file instead.
77 |
78 | From inside the `diccionario-espanol-txt` folder, running the `starting_letter.sh` file creates the `starting_letter` folder with the words classified by their first letter.
79 |
80 | ```
81 | bash src/starting_letter.sh
82 | ```
83 |
84 |
85 | ## Conjugaciones
86 |
87 |
88 | ## Remember
89 |
90 | Double-check after download:
91 |
92 | - There are words starting with á, é, etc.
93 | - Check plurals: gato, gata, gatos, gatas.
94 |
95 | ## Changelog
96 |
97 | 2024-10-20:
98 | - Fixed typos in some variable names
99 | - Try to get plurals
100 | - Verifica ababílla
101 |
--------------------------------------------------------------------------------
/data/archive/2024-05-22/0_prefijos.txt:
--------------------------------------------------------------------------------
1 | a-
2 | acanto-
3 | acro-
4 | acui-
5 | ad-
6 | adeno-
7 | aero-
8 | afro-
9 | agro-
10 | alo-
11 | alti-
12 | ana-
13 | anarco-
14 | andro-
15 | anemo-
16 | anfi-
17 | angio-
18 | anglo-
19 | aniso-
20 | ante-
21 | anti-
22 | antropo-
23 | apico-
24 | arbori-
25 | archi-
26 | arqueo-
27 | arqui-
28 | astro-
29 | atto-
30 | audio-
31 | austro-
32 | auto-
33 | baro-
34 | bi-
35 | biblio-
36 | bio-
37 | bis-
38 | biz-
39 | bradi-
40 | cardio-
41 | cata-
42 | cefalo-
43 | centi-
44 | cian-
45 | ciano-
46 | ciber-
47 | circun-
48 | cis-
49 | cito-
50 | clepto-
51 | co-
52 | con-
53 | contra-
54 | cosmo-
55 | crio-
56 | cripto-
57 | cromo-
58 | crono-
59 | cuadr-
60 | cuadri-
61 | cuadru-
62 | cuasi-
63 | cuatri-
64 | dactilo-
65 | de-
66 | deca-
67 | deci-
68 | demo-
69 | denti-
70 | dento-
71 | derm-
72 | dermat-
73 | dermato-
74 | dermo-
75 | des-
76 | di-
77 | dia-
78 | dis-
79 | dodeca-
80 | e-
81 | eco-
82 | ecto-
83 | electro-
84 | en-
85 | endeca-
86 | endo-
87 | enea-
88 | eno-
89 | entero-
90 | entre-
91 | equi-
92 | eritro-
93 | es-
94 | esclero-
95 | estereo-
96 | etno-
97 | euro-
98 | ex-
99 | exa-
100 | exo-
101 | extra-
102 | fago-
103 | femto-
104 | ferro-
105 | filo-
106 | fisio-
107 | fito-
108 | fono-
109 | foto-
110 | franco-
111 | galacto-
112 | galo-
113 | gamo-
114 | gastero-
115 | gastro-
116 | geo-
117 | germano-
118 | geronto-
119 | giga-
120 | gineco-
121 | gluco-
122 | grafo-
123 | halo-
124 | hecto-
125 | helico-
126 | helio-
127 | hema-
128 | hemato-
129 | hemi-
130 | hemo-
131 | hepato-
132 | hepta-
133 | hetero-
134 | hexa-
135 | hidro-
136 | higro-
137 | hiper-
138 | hipo-
139 | hispano-
140 | histo-
141 | holo-
142 | homeo-
143 | homo-
144 | ibero-
145 | in-
146 | indo-
147 | infra-
148 | inmuno-
149 | inter-
150 | intra-
151 | islamo-
152 | iso-
153 | italo-
154 | kili-
155 | kilo-
156 | leuco-
157 | linfo-
158 | lipo-
159 | lito-
160 | macro-
161 | magneto-
162 | masto-
163 | maxi-
164 | mega-
165 | meso-
166 | meta-
167 | micro-
168 | mili-
169 | mini-
170 | mio-
171 | miria-
172 | mono-
173 | morfo-
174 | moto-
175 | muco-
176 | multi-
177 | nano-
178 | narco-
179 | necro-
180 | nefro-
181 | neo-
182 | neumo-
183 | neuro-
184 | nitro-
185 | nor-
186 | nord-
187 | octa-
188 | octo-
189 | oligo-
190 | onco-
191 | onto-
192 | ornito-
193 | orto-
194 | osteo-
195 | paleo-
196 | pan-
197 | para-
198 | pato-
199 | penta-
200 | per-
201 | peri-
202 | peta-
203 | pico-
204 | piro-
205 | pluri-
206 | podo-
207 | poli-
208 | pos-
209 | pre-
210 | pro-
211 | proto-
212 | psico-
213 | ptero-
214 | quimio-
215 | quiro-
216 | radio-
217 | re-
218 | requete-
219 | res-
220 | rete-
221 | retro-
222 | rino-
223 | rizo-
224 | sarco-
225 | seleno-
226 | semi-
227 | sero-
228 | servo-
229 | sesqui-
230 | seudo-
231 | sin-
232 | so-
233 | sobre-
234 | socio-
235 | son-
236 | sota-
237 | soto-
238 | sub-
239 | sud-
240 | super-
241 | supra-
242 | sur-
243 | tanato-
244 | taqui-
245 | tardo-
246 | tecno-
247 | tele-
248 | tera-
249 | termo-
250 | tetra-
251 | trans-
252 | tri-
253 | tribo-
254 | turbo-
255 | ultra-
256 | uni-
257 | vi-
258 | vice-
259 | video-
260 | viz-
261 | xeno-
262 | xero-
263 | xilo-
264 | zoo-
265 |
--------------------------------------------------------------------------------
/src/rae_downloader.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # Desarrollado por Jorge Dueñas Lerín
4 |
5 | from urllib.parse import quote
6 | from urllib.request import Request, urlopen
7 | from lxml import etree
8 |
9 | import time
10 | import argparse
11 | import pickle
12 |
13 | from helpers import get_xtree, try_conjugacion, try_plural, try_me_siento_con_suerte, url_list_empieza, url_list_termina, skip
14 |
15 |
def _parse_bool(value):
    """Turn a command-line token into a real bool.

    Without a converter argparse stores the raw string, so ``--plurals False``
    would yield the truthy string "False". Only the usual "false" spellings
    (and the empty string) map to False; any other value stays truthy, exactly
    as it was before this converter existed.
    """
    return value.strip().lower() not in ('', '0', 'false', 'f', 'no', 'n', 'off')


# Command-line interface of the downloader.
parser = argparse.ArgumentParser(description='RAE Downloader.')
parser.add_argument('--ix', metavar='ix', type=int, required=True,
                    help='Start with this letter index')
parser.add_argument('--termina', dest='termina', action='store_true',
                    help='Use the "termina" listing URL instead of the "empieza" one')
# --conjugaciones / --skip-conjugaciones share one destination; it defaults to True.
parser.add_argument('--conjugaciones', action='store_true',
                    help='Enable conjugations (default)')
parser.add_argument('--skip-conjugaciones', dest='conjugaciones', action='store_false',
                    help='Disable conjugations')
parser.set_defaults(conjugaciones=True)
# nargs='?' keeps "--plurals VALUE" working and also accepts a bare "--plurals".
parser.add_argument('--plurals', type=_parse_bool, nargs='?', const=True, default=True,
                    help='Whether to look for plurals (true/false, default: true)')
parser.add_argument('--outfile', metavar='outfile no extension', type=str,
                    default="data/raw/allwords",
                    help='Output path without extension; "_<letter>.pkl" is appended')
args = parser.parse_args()
25 |
26 |
# Alphabet walked by the crawler. Accented vowels, ñ and ü are separate
# entries. For a quick trial run this list can be narrowed, e.g. to
# ['s', 'i', 'í'].
letras = [
    'a', 'á', 'b', 'c', 'd', 'e', 'é', 'f', 'g', 'h', 'i', 'í', 'j', 'k', 'l', 'm',
    'n', 'ñ', 'o', 'ó', 'p', 'q', 'r', 's', 't', 'u', 'ú', 'ü', 'v', 'w', 'x', 'y', 'z',
]
letras_count = len(letras)

# Letter selected with --ix; it also ends up in the output file name.
start = letras[args.ix]
print(f"Running with {args.ix}/{letras_count}: {start}")

# Work list of strings still to be searched, and the accumulated words.
start_with = [start]
dict_dump = dict()

# Listing URL template, chosen by the --termina flag.
url_list = url_list_termina if args.termina else url_list_empieza

# Multiplied by the page index to obtain the offset passed to get_xtree.
NITEMS = 20
42 |
def procesa(palabras, destino=None):
    """Store the entries of one result listing in the word dictionary.

    Args:
        palabras: list of text fragments extracted from a listing page.
        destino: dictionary that receives each entry as ``entry -> entry``.
            Defaults to the module-level ``dict_dump``, so existing
            ``procesa(res)`` calls behave exactly as before.

    A fragment that starts with "," is appended to the fragment right before
    it (presumably an inflection suffix such as ", da" - confirm against the
    site markup) and is not stored on its own.

    Note: words can show up more than once. When a search string (for example
    "aba") has too many results and is expanded into "abaa", "abab", ..., the
    first words (for example "aba" itself) do not appear in the expanded
    listings.
    """
    if destino is None:
        destino = dict_dump

    total = len(palabras)
    for ix, pal in enumerate(palabras):
        if pal.startswith(","):
            # Already merged into the previous entry.
            print("Tratada antes", pal)
            continue

        if ix + 1 < total and palabras[ix + 1].startswith(","):
            pal = pal + palabras[ix + 1]

        print(pal)
        destino[pal] = pal

    # TODO: the former post-processing is disabled because it was not updated
    # for the current RAE website. It split entries on ", " into separate
    # words and then ran try_conjugacion / try_plural on each of them.
76 |
77 |
# Depth-first crawl: take the next search string, store what its listing
# shows and, when the listing is paginated, queue the string extended with
# every letter of the alphabet.
while start_with:
    palabra_start_with = start_with.pop(0)

    if palabra_start_with in ('app', 'docs', 'js'):  # RAE servers do not like this
        continue

    try_me_siento_con_suerte(palabra_start_with, dict_dump)

    tree = get_xtree(url_list, palabra_start_with)
    if tree is None:
        # get_xtree ran out of retries; skip this string instead of crashing
        # and losing everything collected so far.
        print("Sin respuesta: " + palabra_start_with)
        continue

    pags = tree.xpath('//*/*[@class="c-pagination"]/*/text()')
    procesa(tree.xpath('//*/article/h3/a/text()'))

    if not pags:
        print("No hay páginas")
        continue

    # Highest page number shown by the pager. default=1 covers a pager with
    # no numeric entries; int(x) also accepts zero-padded digits, which
    # int(x, 0) rejects.
    npags = max((int(x) for x in pags if x.isdigit()), default=1)
    print("Hay páginas")
    # The first page (index 0) was processed above.
    for page in range(1, npags):
        print("Página: " + str(page))
        fparam = page * NITEMS

        tree = get_xtree(url_list, palabra_start_with, fparam)
        if tree is None:
            print("Sin respuesta: " + palabra_start_with + " " + str(fparam))
            continue
        res = tree.xpath('//*/article/h3/a/text()')
        res = res + tree.xpath('//*/article/h3/a/i/text()')
        procesa(res)

    print("!" * 80)
    print("EXAPEND: " + palabra_start_with)
    # Longer strings go to the front so they are visited before the rest.
    start_with = [palabra_start_with + l for l in letras] + start_with


# Persist the collected words; the context manager closes the file handle.
with open(f"{args.outfile}_{start}.pkl", "wb") as out_file:
    pickle.dump(dict_dump, out_file)
117 |
--------------------------------------------------------------------------------
/src/helpers.py:
--------------------------------------------------------------------------------
1 | import time
2 | import argparse
3 | import pickle
4 |
5 | from lxml import etree
6 | from urllib.parse import quote
7 | from urllib.request import Request, urlopen
8 |
# Headers used to simulate a browser.
UA = "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0"

# URL templates: the first placeholder receives the search string, the
# second one (listing URLs only) the result offset.
url_detail = "https://dle.rae.es/{}"
url_list_empieza = url_detail + "/?m=31&f={}"
url_list_termina = url_detail + "/?m=32&f={}"

# The title attribute is used because the visible content changes in some
# situations, e.g. https://dle.rae.es/abollado?m=31 shows:
#
#   abollado1, da
#   abollado2
to_remove_from_title = "Ir a la entrada "

# Length of the title prefix above.
skip = len(to_remove_from_title)
28 |
29 |
def get_xtree(url, param, offset=0, *, attempts=10, timeout=2, retry_delay=10):
    """Download a page and return it parsed as an lxml HTML tree.

    Args:
        url: URL template; formatted with the URL-quoted ``param`` and ``offset``.
        param: search string placed in the URL.
        offset: second value handed to the template (ignored by templates
            that have a single placeholder).
        attempts: maximum number of download attempts.
        timeout: socket timeout in seconds for each attempt.
        retry_delay: seconds to wait between failed attempts.

    Returns:
        The parsed tree, or None when every attempt failed. Callers must
        handle the None case.
    """
    for remaining in range(attempts, 0, -1):
        try:
            req = Request(url.format(quote(param), offset), headers={'User-Agent': UA})
            # The context manager closes the HTTP response once it is parsed.
            with urlopen(req, timeout=timeout) as webpage:
                return etree.parse(webpage, etree.HTMLParser())
        except Exception as e:
            # Deliberately broad: network and parsing errors are both retried.
            print(str(e))
            if remaining > 1:
                # No point in waiting after the last attempt.
                time.sleep(retry_delay)

    return None
47 |
48 |
def try_conjugacion(palabra, dict_dump):
    """Add the conjugated forms listed on the detail page of ``palabra``.

    Args:
        palabra: word whose detail page is downloaded.
        dict_dump: dictionary updated in place with ``form -> form`` entries.

    Nothing is stored when the page cannot be downloaded or has no
    conjugation link.
    """
    print("Intentamos conjugar " + palabra)
    tree = get_xtree(url_detail, palabra)
    if tree is None:
        # get_xtree ran out of retries; there is nothing to inspect.
        print("Sin respuesta: " + palabra)
        return

    contains_conjugacion = tree.xpath('//*[@id="resultados"]/*/a[@class="e2"]/@title')
    if not contains_conjugacion:
        return

    print("^" * 80)
    print(contains_conjugacion)
    # Collect all the text found in the table cells.
    conjugacion = tree.xpath('//div[@id="conjugacion"]//td//text()')
    # Flatten the cells into single tokens, dropping the ", " and " / "
    # separators that appear between alternative forms.
    conjugacion_clean = ' '.join(conjugacion).replace(', ', ' ').replace(' / ', ' ').split(' ')
    for conj in conjugacion_clean:
        if conj != '':
            print(conj)
            dict_dump[conj] = conj
63 |
64 |
def try_me_siento_con_suerte(palabra, dict_dump):
    """Try ``palabra`` directly as a dictionary entry ("I'm feeling lucky").

    When searching for e.g. "si" the RAE listing returns "psicolo...",
    "psi...", etc. This function requests the character string as a detail
    URL instead; most strings are rejected but some are accepted, e.g. "sí".
    Right now "sí" does show up in the listings because accented letters are
    part of the initial list, but there may be words that are missing from
    the search results and are still words.

    Args:
        palabra: candidate string to look up.
        dict_dump: dictionary updated in place when the page shows a title.
    """
    print("Intentamos suerte " + palabra)
    tree = get_xtree(url_detail, palabra)
    if tree is None:
        # get_xtree ran out of retries; the candidate cannot be confirmed.
        print("Sin respuesta: " + palabra)
        return

    posible_palabra = tree.xpath('//*/h1[@class="c-page-header__title"]/text()')
    print(posible_palabra)
    if posible_palabra:
        print("Aceptamos:" + palabra)
        dict_dump[palabra] = palabra
    else:
        print("Denegamos:" + palabra)
79 |
80 |
# The rules below follow https://www.rae.es/dpd/plural and should still be
# reviewed carefully against that reference.

# Accented vowels mapped to their plain counterpart.
_SIN_TILDE = {'á': 'a', 'é': 'e', 'í': 'i', 'ó': 'o', 'ú': 'u'}
_VOCALES = 'aeiouáéíóúü'

# Words ending in vowel + y whose plural takes -s with the y written as i.
_PLURALES_EN_IS = {
    'gay': 'gais',
    'jersey': 'jerséis',
    'espray': 'espráis',
    'yóquey': 'yoqueis',
}

# Words whose plural does not follow the general rule for their ending.
_PLURALES_IRREGULARES = {
    'no': ['noes'],
    'yo': ['yoes', 'yos'],
}


def _quitar_tilde_final(palabra):
    """Drop the written accent of the last vowel before adding -es.

    Adding -es moves a final stressed syllable to the penultimate position,
    where the accent mark is no longer written (camión -> camion + es). The
    accent is kept when it marks a hiatus: í/ú right after a strong vowel,
    optionally with an h in between (país, raíz, ataúd, tahúr).
    """
    for pos in range(len(palabra) - 1, -1, -1):
        letra = palabra[pos]
        if letra not in _VOCALES:
            continue
        if letra not in _SIN_TILDE:
            return palabra  # the last vowel carries no accent
        if letra in 'íú':
            previa = pos - 1
            if previa >= 0 and palabra[previa] == 'h':
                previa -= 1
            if previa >= 0 and palabra[previa] in 'aeoáéó':
                return palabra  # hiatus accent, must stay
        return palabra[:pos] + _SIN_TILDE[letra] + palabra[pos + 1:]
    return palabra


def formar_plural(palabra):
    """Return the candidate plural spellings of ``palabra``.

    Args:
        palabra: singular word in lower case.

    Returns:
        A list with one or more candidate plurals (empty for an empty
        string). The candidates are meant to be checked against the
        dictionary afterwards, so a rule may yield more than one form.

    Known limitations: plurals that gain an accent (joven -> jóvenes), shift
    their stress (carácter -> caracteres) or belong to monosyllables ending
    in -s (tos -> toses) are not produced.

    Example:
        formar_plural("sofá") returns ['sofás'].
    """
    if not palabra:
        return []

    if palabra in _PLURALES_IRREGULARES:
        return list(_PLURALES_IRREGULARES[palabra])

    ultima = palabra[-1]

    # Unstressed final vowel: add -s.
    if ultima in 'aeiou':
        return [palabra + 's']

    # Stressed final -a or -o: add -s, except faralá and albalá, which take
    # -es and lose the accent (faralaes, albalaes).
    if ultima in 'áó':
        if palabra in ('faralá', 'albalá'):
            return [_quitar_tilde_final(palabra) + 'es']
        return [palabra + 's']

    # Stressed final -i or -u: both -s and -es are accepted.
    if ultima in 'íú':
        return [palabra + 's', palabra + 'es']

    # Final -y preceded by a vowel: the y is kept before -es (rey -> reyes);
    # a few words also have an -is plural.
    if ultima == 'y' and len(palabra) > 1 and palabra[-2] in 'aeiou':
        plurales = [palabra + 'es']
        if palabra in _PLURALES_EN_IS:
            plurales.append(_PLURALES_EN_IS[palabra])
        return plurales

    # Final -s or -x: -es after an accented final vowel or for -x,
    # otherwise the word is invariable.
    if ultima in 'sx':
        if palabra[-2:] in ('ás', 'és', 'ís', 'ós', 'ús'):
            return [_quitar_tilde_final(palabra) + 'es']
        if ultima == 'x':
            return [palabra + 'es']
        return [palabra]

    # Final -z becomes c before -es (luz -> luces).
    if ultima == 'z':
        return [_quitar_tilde_final(palabra[:-1] + 'c') + 'es']

    # Final -l, -r, -n, -d, -j: add -es.
    if ultima in 'lrndj':
        return [_quitar_tilde_final(palabra) + 'es']

    # Any other ending: add -s.
    return [palabra + 's']
129 |
130 |
def try_plural(palabra, dict_dump):
    """Look up each candidate plural of ``palabra`` and store the ones found.

    Args:
        palabra: singular word; its candidates come from formar_plural.
        dict_dump: dictionary updated in place with ``plural -> plural``.

    A candidate is accepted when the first text node of the "otras" section
    of its detail page contains it.
    """
    print("Intentamos plural " + palabra)
    for pl in formar_plural(palabra):
        tree = get_xtree(url_detail, pl)
        if tree is None:
            # get_xtree ran out of retries; this candidate cannot be checked.
            print("Sin respuesta: " + pl)
            continue

        posible_plural = tree.xpath('//*[@id="resultados"]/div[@class="otras"]/p/text()')
        if posible_plural and pl in posible_plural[0]:
            print("Aceptamos:" + pl)
            dict_dump[pl] = pl
        else:
            # It may still be a word of its own: "a" -> plural "as" is a word.
            # It is rejected here and collected as a word elsewhere in the script.
            print("Denegamos:" + pl)
--------------------------------------------------------------------------------