44 |
45 | The string could be present anywhere inside the tags body <.>
46 | Patterns can be specified in the string using shell patterns
47 | shup "div[Qy?*[!h]f]"
48 | When no filters applied, shup will only format the HTML
49 | ```
50 |
51 | ## Example
52 |
53 | ```sh
54 | curl -s "www.gnu.org" | shup -r "body" "div[inner]" "ul" "li[[pP]hilo]" "a"
55 | ```
56 |
--------------------------------------------------------------------------------
/shup:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | # Parse HTML
4 | # Basic Tag selection
5 |
6 | # dependencies "sed"
7 | # pystardust: notpiestardust@gmail.com
8 |
# Script-wide defaults. The option parser may override the two mode flags
# (-r, -t) and the indentation unit (-o).
OutputIndent=' '
isText=0
isRaw=0
Version='1.0.0'
13 |
# Print the help text on stdout.
# Takes no arguments and reads no globals.
# The heredoc delimiter is quoted so the text is emitted literally (no
# parameter, command or backslash expansion), and `read -r` keeps any
# backslash in the text intact. The copy loop uses only shell builtins.
Usage () {
    while IFS= read -r _HelpLine; do
        printf '%s\n' "$_HelpLine"
    done <<'EOF'
USAGE: shup [OPTIONS] ["FILTER1" "FILTER2" ...]
-h show this help
-v show version
-r raw: last filter tag will not be shown
-t text: no tags will be shown
-o "string" specify output indentation
HTML must be provided as stdin

FILTER FORMAT: "<tag>" or "<tag>[<search string>]"
the search string should be present in the tag line
EXAMPLE
to match all div tags
shup "div"
to match div tags with some string
shup "div[Qynugf]"
or you could be more specific (use single quotes)
shup 'div[class="Qynugf"]'
both will match : <div class="Qynugf">

The string could be present anywhere inside the tags body <.>
Patterns can be specified in the string using shell patterns
shup "div[Qy?*[!h]f]"

Example with multiple filters
curl -s "www.gnu.org" | shup -r "body" "div[inner]" "ul" "li[[pP]hilo]" "a"

When no filters applied, shup will only format the HTML
EOF
    unset _HelpLine
}
49 |
50 |
# Parse command-line options.
#   -h  print help and exit 0
#   -v  print version and exit 0
#   -r  set isRaw=1
#   -t  set isText=1
#   -o  set OutputIndent to the option argument
# An unknown option or a missing -o argument makes getopts print its own
# diagnostic; the catch-all arm then shows the usage on stderr and exits 2
# instead of silently continuing with the option ignored.
while getopts 'hvrto:' OPT; do
    case $OPT in
        h) Usage; exit 0 ;;
        v) printf '%s\n' "$Version"; exit 0 ;;
        r) isRaw=1 ;;
        t) isText=1 ;;
        o) OutputIndent=$OPTARG ;;
        *) Usage >&2; exit 2 ;;
    esac
done
# Drop the parsed options so "$@" holds only the filters.
shift $((OPTIND - 1))
61 |
62 |
# Scratch files: TempFile holds the current document, TempFile2 receives the
# output of each filter pass. Abort if either cannot be created.
TempFile=$(mktemp) || { printf '%s\n' 'shup: cannot create temporary file' >&2; exit 1; }
TempFile2=$(mktemp) || {
    rm -f "$TempFile"
    printf '%s\n' 'shup: cannot create temporary file' >&2
    exit 1
}

# Remove the scratch files on every exit path. The signal trap turns
# HUP/INT/TERM into a normal exit so the EXIT trap also runs for them.
trap 'rm -f "$TempFile" "$TempFile2"' EXIT
trap 'exit 1' HUP INT TERM

# Copy stdin to stdout line by line.
# `read -r` keeps backslashes literal (the marking pass relies on seeing
# escaped \< and \> sequences); the `|| [ -n ... ]` clause keeps a final line
# that has no trailing newline.
StoreStdin () {
    while IFS= read -r _Line || [ -n "$_Line" ]; do
        printf '%s\n' "$_Line"
    done
    unset _Line
}

# Store stdin in a file
StoreStdin > "$TempFile"
70 |
71 |
# Rewrite the file named by $1 in place so that every tag sits on its own
# line and is prefixed with a one-byte marker:
#   \001 in front of open tags
#   \002 in front of close tags
#   \003 in front of non-opening tags like: meta img, and any <!...>
#        declaration or comment
# Text between tags ends up on lines of its own, without a marker.
#
# Steps performed by the sed program:
#   1. drop empty / whitespace-only lines
#   2. hide escaped \< and \> behind placeholder bytes (\007, \006) so they
#      are not mistaken for tag delimiters
#   3. while the pattern space ends inside an unclosed "<...", append the
#      next line (joined with a space); `$!` stops the join on the last
#      line so the marking commands below still run
#   4. mark open tags, then close tags (anchored on "</")
#   5. restore the escaped \< and \>
#   6. re-mark void elements and <!...> from \001 to \003; the tag name must
#      be followed by whitespace, ">" or "/" so "<br/>" qualifies and
#      "<colgroup>" does not
#
# Relies on GNU sed: -i without a suffix, \xHH escapes, \n in replacements.
MarkTags () {
    sed -E -i '
        /^[[:space:]]*$/d
        :loop
        s_\\<_\x07_g
        s_\\>_\x06_g
        /<[^>]*$/{
            $!{
                N
                s/\n/ /
                b loop
            }
        }
        s_<[^/][^>]*>_\n\x01&\n_g
        s_</[^>]*>_\n\x02&\n_g
        s_\x07_\\<_g
        s_\x06_\\>_g
        s_\x01(<(!|(area|base|br|col|command|embed|hr|img|input|keygen|link|meta|param|source|track|wbr)[[:space:]>/]))_\x03\1_g
    ' "$1"
}

MarkTags "$TempFile"
96 |
97 |
# Keep the three one-byte line markers (octal 001, 002, 003) in variables so
# the functions below can use them in case patterns and prefix stripping.
NMark="$(printf '%b' '\0003')"
CloseMark="$(printf '%b' '\0002')"
OpenMark="$(printf '%b' '\0001')"
102 |
# Print the indentation unit $OutputIndent $1 times, without a newline.
#   $1  repeat count; missing, zero or negative prints nothing
# The unit is passed as data to '%b' rather than used as the printf format:
# backslash escapes such as \t are still interpreted, but a '%' in the
# user-supplied string is printed literally instead of being treated as a
# conversion specification.
TabsOut () {
    _i=0
    while [ "$_i" -lt "${1:-0}" ]; do
        printf '%b' "$OutputIndent"
        _i=$((_i + 1))
    done
    unset _i
}
112 |
# Format marked HTML read from stdin and indent every tag block.
# Input lines are expected to carry the OpenMark / CloseMark / NMark prefixes;
# the marker is stripped before the line is printed.
#   open tag    : printed at the current depth, then depth + 1
#   close tag   : depth - 1 (never below zero), then printed
#   NMark / text: printed at the current depth
#   empty line  : skipped
# Indentation is produced by TabsOut. `read -r` keeps backslashes in the
# document intact, and the depth is clamped at zero so a stray closing tag
# cannot shift the rest of the output to the left.
FormatHtml () {
    _Indent=0
    while IFS= read -r _Line || [ -n "$_Line" ]; do
        case $_Line in
            '')
                ;;
            "$OpenMark"*)
                TabsOut "$_Indent"
                printf '%s\n' "${_Line#"$OpenMark"}"
                _Indent=$((_Indent + 1))
                ;;
            "$CloseMark"*)
                if [ "$_Indent" -gt 0 ]; then
                    _Indent=$((_Indent - 1))
                fi
                TabsOut "$_Indent"
                printf '%s\n' "${_Line#"$CloseMark"}"
                ;;
            "$NMark"*)
                TabsOut "$_Indent"
                printf '%s\n' "${_Line#"$NMark"}"
                ;;
            *)
                TabsOut "$_Indent"
                printf '%s\n' "$_Line"
                ;;
        esac
    done
    unset _Indent _Line
}
142 |
# Text-only output: copy stdin to stdout, dropping empty lines and every
# line that starts with one of the tag markers (OpenMark, CloseMark, NMark).
# `read -r` keeps backslashes in the text intact.
FormatHtmlText () {
    while IFS= read -r _Line || [ -n "$_Line" ]; do
        case $_Line in
            '' | "$OpenMark"* | "$CloseMark"* | "$NMark"*)
                ;;
            *)
                printf '%s\n' "$_Line"
                ;;
        esac
    done
    unset _Line
}
155 |
# Extract the body of every block whose opening tag matches a filter.
#   $1  tag name (may contain shell pattern characters; empty = any tag)
#   $2  shell pattern matched against the rest of the opening-tag line;
#       empty means "no attribute constraint"
# Globals read: OpenMark, CloseMark, Raw
#   Raw=0   the selected block's own opening/closing tag lines are printed
#   other   only the lines between them are printed (Raw defaults to 1)
# Reads marked lines on stdin, writes the selected lines to stdout.
#
# _Depth counts how many $1 tags are currently open inside a selected block,
# so a nested tag of the same name does not end the block early.
# A non-empty tag name must be followed by whitespace, "/" or ">" so that
# the filter "a" does not also select <article>, <abbr>, ...
GetOpenCloseTag () {
    _Depth=0
    _Tag=$1
    _Pattern=$2
    Raw=${Raw:-1}
    if [ -n "$_Tag" ]; then
        _TagEnd='[[:space:]/>]'
    else
        _TagEnd=
    fi

    while IFS= read -r _Line || [ -n "$_Line" ]; do
        case $_Line in
            # opening tag that satisfies the filter: starts or deepens a block
            "$OpenMark<"$_Tag$_TagEnd*$_Pattern)
                if [ "$_Depth" -gt 0 ] || [ "$Raw" -eq 0 ]; then
                    printf '%s\n' "$_Line"
                fi
                _Depth=$((_Depth + 1))
                ;;
            # same tag name without the pattern: only counts inside a block
            "$OpenMark<"$_Tag$_TagEnd*)
                if [ "$_Depth" -gt 0 ]; then
                    printf '%s\n' "$_Line"
                    _Depth=$((_Depth + 1))
                fi
                ;;
            # closing tag: the outermost one is printed only when Raw is 0
            "$CloseMark</"$_Tag$_TagEnd*)
                if [ "$_Depth" -gt 0 ]; then
                    if [ "$_Depth" -gt 1 ] || [ "$Raw" -eq 0 ]; then
                        printf '%s\n' "$_Line"
                    fi
                    _Depth=$((_Depth - 1))
                fi
                ;;
            *)
                if [ "$_Depth" -gt 0 ]; then
                    printf '%s\n' "$_Line"
                fi
                ;;
        esac
    done
    unset _Tag _Pattern _TagEnd _Depth _Line
}
193 |
194 |
# Print every NMark line (a tag without a closing counterpart) that matches
# a filter.
#   $1  tag name (empty = any tag)
#   $2  shell pattern matched against the rest of the tag line; when the
#       argument is omitted the global Pattern is used, which is what this
#       function read before it honoured its second argument
# Globals read: NMark (and Pattern, only as the fallback described above)
# Reads marked lines on stdin, writes matching lines (marker included) to
# stdout. A non-empty tag name must be followed by whitespace, "/" or ">".
# Uses its own loop variable so the caller's Line is left untouched.
GetSingletonTag () {
    _Tag=$1
    _Pattern=${2-$Pattern}
    if [ -n "$_Tag" ]; then
        _TagEnd='[[:space:]/>]'
    else
        _TagEnd=
    fi

    while IFS= read -r _Line || [ -n "$_Line" ]; do
        case $_Line in
            "$NMark<"$_Tag$_TagEnd*$_Pattern)
                printf '%s\n' "$_Line"
                ;;
        esac
    done
    unset _Tag _Pattern _TagEnd _Line
}
207 |
# Split the global Selection ("tag" or "tag[search]") into the globals Tag
# and Pattern.
#   Tag      everything before the first "["
#   Pattern  everything after the first "[", minus one trailing "]", with a
#            "*" appended; empty when Selection contains no "["
GetTagAndPattern () {
    case $Selection in
        *\[*)
            Tag=${Selection%%\[*}
            Pattern=${Selection#*\[}
            Pattern=${Pattern%\]}'*'
            ;;
        *)
            Tag=$Selection
            Pattern=
            ;;
    esac
}
218 |
# Apply the filters one after another. Each pass reads the current document
# from TempFile, writes the selection to TempFile2, and the result is copied
# back so the next filter narrows the previous one. Processing stops at the
# first empty argument.
until [ -z "$1" ]; do
    Selection=$1
    shift
    GetTagAndPattern

    # Raw is switched to 0 only for the final filter, and only when -r was
    # not given; it is otherwise left as it is.
    if [ "$isRaw" -eq 0 ] && [ -z "$1" ]; then
        Raw=0
    fi

    case $Tag in
        area|base|br|col|command|embed|hr|img|input|keygen|link|meta|param|source|track|wbr)
            # tags without a closing counterpart
            GetSingletonTag "$Tag" "$Pattern"
            ;;
        *)
            GetOpenCloseTag "$Tag" "$Pattern"
            ;;
    esac < "$TempFile" > "$TempFile2"
    cp "$TempFile2" "$TempFile"
done
238 |
# Print the final document: text only when isText is non-zero, otherwise
# indented HTML.
if ! [ "$isText" -eq 0 ]; then
    FormatHtmlText < "$TempFile"
else
    FormatHtml < "$TempFile"
fi

# Remove both scratch files.
rm -f "$TempFile2" "$TempFile"
248 |
--------------------------------------------------------------------------------