1 | package com.renomad.minum.htmlparsing; | |
2 | ||
3 | import com.renomad.minum.security.ForbiddenUseException; | |
4 | import com.renomad.minum.utils.RingBuffer; | |
5 | ||
6 | import java.io.ByteArrayInputStream; | |
7 | import java.nio.charset.StandardCharsets; | |
8 | import java.util.*; | |
9 | ||
10 | /** | |
11 | * Converts HTML strings to object trees. | |
12 | * <p> | |
13 | * Enables a developer to analyze an HTML document by its | |
14 | * structure. | |
15 | * </p> | |
16 | * <p> | |
17 | * Note: HTML parsing is difficult because | |
18 | * of its lenient specification. See Postel's Law. | |
19 | * </p> | |
20 | * <p> | |
21 | * For our purposes, it is less important | |
22 | * to perfectly meet the criteria of the spec, so | |
23 | * there will be numerous edge-cases unaccounted-for | |
24 | * by this implementation. Nevertheless, this program | |
25 | * should suit many needs for ordinary web applications. | |
26 | * </p> | |
27 | */ | |
28 | public final class HtmlParser { | |
29 | ||
30 | /** | |
31 | * Most total chars we'll read. | |
32 | */ | |
33 | static final int MAX_HTML_SIZE = 2 * 1024 * 1024; | |
34 | ||
35 | /** | |
36 | * Given any HTML input, scan through and generate a tree | |
37 | * of HTML nodes. Return a list of the roots of the tree. | |
38 | * <p> | |
39 | * This parser operates with a very particular paradigm in mind. I'll explain | |
40 | * it through examples. Let's look at some typical HTML: | |
41 | * </p> | |
42 | * <pre>{@code <p>Hello world</p>}</pre> | |
43 | * <p> | |
44 | * The way we will model this is as follows: | |
45 | * </p> | |
46 | * <pre>{@code <ELEMENT_NAME_AND_DETAILS>content<END_OF_ELEMENT>}</pre> | |
47 | * <p> | |
48 | * We will examine the first part, "ELEMENT_NAME_AND_DETAILS", and | |
49 | * grab the element's name and any attributes. Then we will descend into the | |
50 | * content section. We know we have hit the end of the element by keeping | |
51 | * track of how far we have descended/ascended and whether we are hitting | |
52 | * a closing HTML element. | |
53 | * </p> | |
54 | * <p> | |
55 | * Complicating this is that elements may not have content, for example | |
56 | * any <a href="https://www.w3.org/TR/2011/WD-html-markup-20110113/syntax.html#void-element_xref3">void elements</a> | |
57 | * or when a user chooses to create an empty tag | |
58 | * </p> | |
59 | */ | |
60 | public List<HtmlParseNode> parse(String input) { | |
61 |
2
1. parse : changed conditional boundary → KILLED 2. parse : negated conditional → KILLED |
if (input.length() > MAX_HTML_SIZE) |
62 | throw new ForbiddenUseException("Input exceeds max allowed HTML text size, " + MAX_HTML_SIZE + " chars"); | |
63 | var is = new ByteArrayInputStream(input.getBytes(StandardCharsets.UTF_8)); | |
64 | ||
65 | List<HtmlParseNode> nodes = new ArrayList<>(); | |
66 | State state = State.buildNewState(); | |
67 | ||
68 | while (true) { | |
69 | int value = is.read(); | |
70 | // if the value is -1, there's nothing left to read | |
71 |
2
1. parse : negated conditional → KILLED 2. parse : replaced return value with Collections.emptyList for com/renomad/minum/htmlparsing/HtmlParser::parse → KILLED |
if (value == -1) return nodes; |
72 | ||
73 | char currentChar = (char) value; | |
74 |
1
1. parse : removed call to com/renomad/minum/htmlparsing/HtmlParser::processState → KILLED |
processState(currentChar, state, nodes); |
75 | } | |
76 | } | |
77 | ||
78 | /** | |
79 | * Use important symbols in the HTML code to indicate | |
80 | * which mode we are in - reading inside a tag, or between | |
81 | * tags. | |
82 | * <p> | |
83 | * Apologies to future readers. Hand-written parser code is the suck. | |
84 | * </p> | |
85 | * <p> | |
86 | * That said, there are plenty of tests exercising this, and it is | |
87 | * easy to test due to having been built using TDD. Cold comfort, I know. | |
88 | * </p> | |
89 | */ | |
90 | private void processState(char currentChar, State state, List<HtmlParseNode> nodes) { | |
91 |
1
1. processState : removed call to com/renomad/minum/htmlparsing/HtmlParser::recordLocation → KILLED |
recordLocation(currentChar, state); |
92 | ||
93 | // keep track of previous twelve characters, to check if inside comments and scripts | |
94 |
1
1. processState : removed call to com/renomad/minum/utils/RingBuffer::add → KILLED |
state.previousCharacters.add(currentChar); |
95 |
1
1. processState : removed call to com/renomad/minum/htmlparsing/HtmlParser::determineCommentState → KILLED |
determineCommentState(state); |
96 |
1
1. processState : removed call to com/renomad/minum/htmlparsing/HtmlParser::determineScriptState → KILLED |
determineScriptState(state); |
97 |
1
1. processState : negated conditional → KILLED |
if (state.isInsideComment) { |
98 | return; | |
99 | } | |
100 |
1
1. processState : negated conditional → KILLED |
if (state.isInsideScript) { |
101 | state.stringBuilder.append(currentChar); | |
102 | return; | |
103 | } | |
104 | ||
105 |
1
1. processState : negated conditional → KILLED |
if (currentChar == '<') { |
106 |
1
1. processState : removed call to com/renomad/minum/htmlparsing/HtmlParser::processLessThan → KILLED |
processLessThan(currentChar, state); |
107 |
1
1. processState : negated conditional → KILLED |
} else if (currentChar == '>') { |
108 |
1
1. processState : removed call to com/renomad/minum/htmlparsing/HtmlParser::processGreaterThan → KILLED |
processGreaterThan(currentChar, state, nodes); |
109 | } else { | |
110 |
1
1. processState : removed call to com/renomad/minum/htmlparsing/HtmlParser::addingToken → KILLED |
addingToken(state, currentChar); |
111 | } | |
112 | } | |
113 | ||
114 | /** | |
115 | * handle basic recording of stats, like row and column, | |
116 | * useful during error messages | |
117 | */ | |
118 | private static void recordLocation(char currentChar, State state) { | |
119 |
1
1. recordLocation : Replaced integer addition with subtraction → KILLED |
state.charsRead += 1; |
120 |
1
1. recordLocation : negated conditional → KILLED |
if (currentChar == '\n') { |
121 |
1
1. recordLocation : Replaced integer addition with subtraction → KILLED |
state.lineRow += 1; |
122 | state.lineColumn = 0; | |
123 | } | |
124 |
1
1. recordLocation : Replaced integer addition with subtraction → KILLED |
state.lineColumn += 1; |
125 | } | |
126 | ||
127 | private void processGreaterThan(char currentChar, State state, List<HtmlParseNode> nodes) { | |
128 | /* It's allowed to use greater-than signs in a lot of places */ | |
129 |
1
1. processGreaterThan : negated conditional → KILLED |
if (state.isInsideTag) { |
130 |
1
1. processGreaterThan : removed call to com/renomad/minum/htmlparsing/HtmlParser::handleExitingTag → KILLED |
handleExitingTag(currentChar, state, nodes); |
131 | } else { | |
132 | /* | |
133 | This situation means we're looking at a | |
134 | free-floating greater-than symbol in | |
135 | the html text. | |
136 | */ | |
137 | state.stringBuilder.append(currentChar); | |
138 | } | |
139 | } | |
140 | ||
141 | /** | |
142 | * As we leave the tag, we make some decisions about it. | |
143 | */ | |
144 | private void handleExitingTag(char currentChar, State state, List<HtmlParseNode> nodes) { | |
145 |
1
1. handleExitingTag : negated conditional → KILLED |
if (state.isInsideAttributeValueQuoted) { |
146 | /* | |
147 | Here, we're looking at a greater-than | |
148 | that is inside a quoted attribute value | |
149 | */ | |
150 | state.stringBuilder.append(currentChar); | |
151 | } else { | |
152 |
1
1. handleExitingTag : removed call to com/renomad/minum/htmlparsing/HtmlParser::handleTagComponents → KILLED |
handleTagComponents(state, nodes); |
153 | } | |
154 | } | |
155 | ||
156 | private void handleTagComponents(State state, List<HtmlParseNode> nodes) { | |
157 |
1
1. handleTagComponents : negated conditional → KILLED |
if (hasFinishedBuildingTagname(state.hasEncounteredTagName, state.tagName, state.stringBuilder)) { |
158 | state.tagName = state.stringBuilder.toString(); | |
159 |
3
1. handleTagComponents : negated conditional → KILLED 2. handleTagComponents : negated conditional → KILLED 3. handleTagComponents : negated conditional → KILLED |
} else if (!state.stringBuilder.isEmpty() && state.currentAttributeKey.isBlank() && state.isReadingAttributeKey) { |
160 | state.attributes.put(state.stringBuilder.toString(), ""); | |
161 | state.stringBuilder = new StringBuilder(); | |
162 | state.isReadingAttributeKey = false; | |
163 |
1
1. handleTagComponents : negated conditional → KILLED |
} else if (!state.currentAttributeKey.isBlank()) { |
164 | // if we were in the midst of reading attribute stuff when we hit the closing bracket... | |
165 |
1
1. handleTagComponents : negated conditional → KILLED |
if (!state.stringBuilder.isEmpty()) { |
166 | state.attributes.put(state.currentAttributeKey, state.stringBuilder.toString()); | |
167 | } else { | |
168 | state.attributes.put(state.currentAttributeKey, ""); | |
169 | } | |
170 | state.isInsideAttributeValueQuoted = false; | |
171 | state.stringBuilder = new StringBuilder(); | |
172 | state.currentAttributeKey = ""; | |
173 | } | |
174 | ||
175 |
1
1. handleTagComponents : removed call to com/renomad/minum/htmlparsing/HtmlParser::processTagAndResetState → KILLED |
processTagAndResetState(state, nodes); |
176 | } | |
177 | ||
178 | static boolean hasFinishedBuildingTagname(boolean hasEncounteredTagName, String tagName, StringBuilder sb) { | |
179 |
4
1. hasFinishedBuildingTagname : negated conditional → KILLED 2. hasFinishedBuildingTagname : negated conditional → KILLED 3. hasFinishedBuildingTagname : replaced boolean return with true for com/renomad/minum/htmlparsing/HtmlParser::hasFinishedBuildingTagname → KILLED 4. hasFinishedBuildingTagname : negated conditional → KILLED |
return hasEncounteredTagName && tagName.isEmpty() && !sb.isEmpty(); |
180 | } | |
181 | ||
182 | private void processLessThan(char currentChar, State state) { | |
183 | /* less-than signs are policed strictly */ | |
184 |
1
1. processLessThan : negated conditional → KILLED |
if (state.isInsideAttributeValueQuoted) { |
185 | /* | |
186 | Here, we're looking at a less-than that | |
187 | is inside a quoted attribute value | |
188 | */ | |
189 | state.stringBuilder.append(currentChar); | |
190 | } else { | |
191 |
1
1. processLessThan : removed call to com/renomad/minum/htmlparsing/HtmlParser::enteringTag → KILLED |
enteringTag(state); |
192 | } | |
193 | } | |
194 | ||
195 | /** | |
196 | * When we've read a less-than sign and are entering an HTML tag. | |
197 | */ | |
198 | private void enteringTag(State state) { | |
199 |
1
1. enteringTag : removed call to com/renomad/minum/htmlparsing/HtmlParser::addText → KILLED |
addText(state); |
200 | ||
201 | state.isInsideTag = true; | |
202 | /* | |
203 | not really sure it's a start tag, but if we | |
204 | assume it is that's fine, because if we hit | |
205 | a forward slash at the beginning, it becomes | |
206 | a non-start-tag. | |
207 | */ | |
208 | state.isStartTag = true; | |
209 | state.stringBuilder = new StringBuilder(); | |
210 | } | |
211 | ||
212 | private static void addText(State state) { | |
213 |
1
1. addText : negated conditional → KILLED |
if (!state.stringBuilder.isEmpty()) { |
214 | ||
215 | String textContent = state.stringBuilder.toString(); | |
216 | ||
217 | // This is where we add characters if we found any between tags. | |
218 |
2
1. addText : negated conditional → KILLED 2. addText : negated conditional → KILLED |
if (! state.parseStack.isEmpty() && ! textContent.isBlank()) { |
219 |
1
1. addText : removed call to com/renomad/minum/htmlparsing/HtmlParseNode::addToInnerContent → KILLED |
state.parseStack.peek().addToInnerContent(new HtmlParseNode(ParseNodeType.CHARACTERS, TagInfo.EMPTY, new ArrayList<>(), textContent)); |
220 | } | |
221 | } | |
222 | } | |
223 | ||
224 | /** | |
225 | * Called when we've just hit a greater-than sign and thus | |
226 | * exited an HTML tag. | |
227 | */ | |
228 | private void processTagAndResetState(State state, List<HtmlParseNode> nodes) { | |
229 |
1
1. processTagAndResetState : removed call to com/renomad/minum/htmlparsing/HtmlParser::processTag → KILLED |
processTag(state, nodes); |
230 | ||
231 | state.isHalfClosedTag = false; | |
232 | state.isInsideTag = false; | |
233 | state.isStartTag = false; | |
234 | state.isReadingTagName = false; | |
235 | state.tagName = ""; | |
236 | state.attributes = new HashMap<>(); | |
237 | state.hasEncounteredTagName = false; | |
238 | state.stringBuilder = new StringBuilder(); | |
239 | } | |
240 | ||
241 | /** | |
242 | * The commonest case when reading characters. Buckle up. | |
243 | */ | |
244 | private void addingToken(State state, char currentChar) { | |
245 |
2
1. addingToken : negated conditional → KILLED 2. addingToken : negated conditional → KILLED |
var hasNotBegunReadingTagName = state.isInsideTag && !state.hasEncounteredTagName; |
246 | ||
247 |
1
1. addingToken : negated conditional → KILLED |
if (hasNotBegunReadingTagName) { |
248 |
1
1. addingToken : removed call to com/renomad/minum/htmlparsing/HtmlParser::handleBeforeReadingTagName → KILLED |
handleBeforeReadingTagName(state, currentChar); |
249 |
1
1. addingToken : negated conditional → KILLED |
} else if (state.isReadingTagName) { |
250 |
1
1. addingToken : removed call to com/renomad/minum/htmlparsing/HtmlParser::handleReadingTagName → KILLED |
handleReadingTagName(state, currentChar); |
251 |
1
1. addingToken : negated conditional → KILLED |
} else if (isFinishedReadingTag(state.tagName, state.isInsideTag)) { |
252 |
1
1. addingToken : removed call to com/renomad/minum/htmlparsing/HtmlParser::handleAfterReadingTagName → KILLED |
handleAfterReadingTagName(state, currentChar); |
253 | } else { | |
254 | state.stringBuilder.append(currentChar); | |
255 | } | |
256 | } | |
257 | ||
258 | static boolean isFinishedReadingTag(String tagName, boolean isInsideTag) { | |
259 |
3
1. isFinishedReadingTag : negated conditional → TIMED_OUT 2. isFinishedReadingTag : replaced boolean return with true for com/renomad/minum/htmlparsing/HtmlParser::isFinishedReadingTag → KILLED 3. isFinishedReadingTag : negated conditional → KILLED |
return !tagName.isEmpty() && isInsideTag; |
260 | } | |
261 | ||
262 | static final List<Character> startOfComment = List.of('<', '!', '-', '-'); | |
263 | static final List<Character> endOfComment = List.of('-', '-', '>'); | |
264 | ||
265 | /** | |
266 | * Returns whether we are inside an HTML comment, | |
267 | * that is {@code <!-- -->} | |
268 | */ | |
269 | private void determineCommentState(State state) { | |
270 | boolean atCommentStart = state.previousCharacters.containsAt(startOfComment, 8); | |
271 | boolean atCommentEnd = state.previousCharacters.containsAt(endOfComment, 8); | |
272 | boolean isInsideTag = state.isInsideTag; | |
273 | boolean hasEncounteredTagName = state.hasEncounteredTagName; | |
274 |
3
1. determineCommentState : negated conditional → KILLED 2. determineCommentState : negated conditional → KILLED 3. determineCommentState : negated conditional → KILLED |
if (isInsideTag && !hasEncounteredTagName && atCommentStart) { |
275 | state.isInsideComment = true; | |
276 | state.isInsideTag = false; | |
277 |
2
1. determineCommentState : negated conditional → KILLED 2. determineCommentState : negated conditional → KILLED |
} else if (state.isInsideComment && atCommentEnd) { |
278 | state.isInsideComment = false; | |
279 | } | |
280 | } | |
281 | ||
282 | static final List<Character> scriptElement = List.of('<','/','s','c','r','i','p','t','>'); | |
283 | ||
284 | /** | |
285 | * Determines whether we have hit the end of the script block | |
286 | * by looking for the closing script tag. | |
287 | */ | |
288 | private void determineScriptState(State state) { | |
289 | boolean isScriptFinished = state.previousCharacters.containsAt(scriptElement, 3); | |
290 | boolean wasInsideScript = state.isInsideScript; | |
291 |
2
1. determineScriptState : negated conditional → KILLED 2. determineScriptState : negated conditional → KILLED |
state.isInsideScript = state.isInsideScript && !isScriptFinished; |
292 |
2
1. determineScriptState : negated conditional → KILLED 2. determineScriptState : negated conditional → KILLED |
boolean justClosedScriptTag = wasInsideScript && !state.isInsideScript; |
293 |
1
1. determineScriptState : negated conditional → KILLED |
if (justClosedScriptTag) { |
294 | state.tagName = "script"; | |
295 | state.isInsideTag = true; | |
296 | state.isStartTag = false; | |
297 | var innerTextLength = state.stringBuilder.length(); | |
298 |
1
1. determineScriptState : Replaced integer subtraction with addition → KILLED |
state.stringBuilder.delete(innerTextLength - 8, innerTextLength); |
299 |
1
1. determineScriptState : removed call to com/renomad/minum/htmlparsing/HtmlParser::addText → KILLED |
addText(state); |
300 | } | |
301 | ||
302 | } | |
303 | ||
304 | /** | |
305 | * at this point we have a tagname for our tag, and we're still in the tag | |
306 | */ | |
307 | private static void handleAfterReadingTagName(State state, char currentChar) { | |
308 | ||
309 | boolean isHandlingAttributes = isHandlingAttributes(state, currentChar); | |
310 |
1
1. handleAfterReadingTagName : negated conditional → KILLED |
if (isHandlingAttributes) { |
311 | ||
312 |
1
1. handleAfterReadingTagName : negated conditional → KILLED |
if (state.currentAttributeKey.isBlank()) { |
313 | /* | |
314 | because the key is blank, we know we haven't read it all. That's | |
315 | because when we finish reading the key, we'll add it to currentAttributeKey | |
316 | and be in the mode of reading the value. | |
317 | */ | |
318 |
1
1. handleAfterReadingTagName : removed call to com/renomad/minum/htmlparsing/HtmlParser::handleNotFullyReadAttributeKey → KILLED |
handleNotFullyReadAttributeKey(state, currentChar); |
319 | } else { | |
320 | // reading in the (potential) attribute value | |
321 | ||
322 |
1
1. handleAfterReadingTagName : removed call to com/renomad/minum/htmlparsing/HtmlParser::handlePotentialAttributeValue → KILLED |
handlePotentialAttributeValue(state, currentChar); |
323 | } | |
324 | } | |
325 | } | |
326 | ||
327 | /** | |
328 | * Check whether we're past the whitespace between the tag name and | |
329 | * the start of the (potential) attribute key. | |
330 | */ | |
331 | static boolean isHandlingAttributes(State state, char currentChar) { | |
332 |
2
1. isHandlingAttributes : replaced boolean return with true for com/renomad/minum/htmlparsing/HtmlParser::isHandlingAttributes → KILLED 2. isHandlingAttributes : negated conditional → KILLED |
return ! (state.currentAttributeKey.isEmpty() && |
333 |
2
1. isHandlingAttributes : negated conditional → KILLED 2. isHandlingAttributes : negated conditional → KILLED |
state.stringBuilder.isEmpty() |
334 | && currentChar == ' '); | |
335 | } | |
336 | ||
337 | private static void handlePotentialAttributeValue(State state, char currentChar) { | |
338 |
1
1. handlePotentialAttributeValue : negated conditional → KILLED |
if (state.isInsideAttributeValueQuoted) { |
339 | // if we're already inside a quoted area, encountering a | |
340 | // closing quote will take us out of it. | |
341 |
1
1. handlePotentialAttributeValue : negated conditional → KILLED |
if (currentChar == state.quoteType.literal) { |
342 | // if we hit the matching end-quote, switch modes | |
343 | state.isInsideAttributeValueQuoted = false; | |
344 | state.quoteType = QuoteType.NONE; | |
345 | state.attributes.put(state.currentAttributeKey, state.stringBuilder.toString()); | |
346 | state.stringBuilder = new StringBuilder(); | |
347 | state.currentAttributeKey = ""; | |
348 | state.isReadingAttributeKey = false; | |
349 | } else { | |
350 | // otherwise keep on trucking, adding characters | |
351 | state.stringBuilder.append(currentChar); | |
352 | } | |
353 | } else { | |
354 |
2
1. handlePotentialAttributeValue : negated conditional → KILLED 2. handlePotentialAttributeValue : negated conditional → KILLED |
if (currentChar == '"' || currentChar == '\'') { |
355 | /* | |
356 | if we're not currently inside a quoted area but encounter | |
357 | a quote, switch modes. | |
358 | */ | |
359 | state.isInsideAttributeValueQuoted = true; | |
360 | state.quoteType = QuoteType.byLiteral(currentChar); | |
361 |
2
1. handlePotentialAttributeValue : negated conditional → KILLED 2. handlePotentialAttributeValue : negated conditional → KILLED |
} else if (!state.stringBuilder.isEmpty() && currentChar == ' ') { |
362 | /* | |
363 | if we're not in a quoted area and encounter a space, then | |
364 | we're done reading the attribute value and can add the key-value | |
365 | pair to the map. | |
366 | */ | |
367 | state.attributes.put(state.currentAttributeKey, state.stringBuilder.toString()); | |
368 | state.isReadingAttributeKey = false; | |
369 | state.stringBuilder = new StringBuilder(); | |
370 | state.currentAttributeKey = ""; | |
371 | } else { | |
372 | // otherwise keep trucking along adding characters | |
373 | state.stringBuilder.append(currentChar); | |
374 | } | |
375 | } | |
376 | } | |
377 | ||
378 | private static void handleNotFullyReadAttributeKey(State state, char currentChar) { | |
379 |
1
1. handleNotFullyReadAttributeKey : negated conditional → KILLED |
if (state.isHalfClosedTag) { |
380 | /* | |
381 | This situation occurs when we are in a void tag, like <link />, | |
382 | and are closing the tag with a forward slash + closing bracket. | |
383 | ||
384 | if we got here, it means the previous char was | |
385 | a forward slash, so the current character *should* | |
386 | be a closing angle, but if it's not ... | |
387 | */ | |
388 | throw new ParsingException(String.format("in closing a void tag (e.g. <link />), character after forward slash must be angle bracket. Char: %s at line %d and at the %d character. %d chars read in total.", currentChar, state.lineRow, state.lineColumn, state.charsRead)); | |
389 |
2
1. handleNotFullyReadAttributeKey : negated conditional → KILLED 2. handleNotFullyReadAttributeKey : negated conditional → KILLED |
} else if (currentChar == ' ' || currentChar == '=') { |
390 | // if we hit whitespace or an equals sign, we're done reading the key | |
391 | state.currentAttributeKey = state.stringBuilder.toString(); | |
392 | state.isReadingAttributeKey = false; | |
393 | state.stringBuilder = new StringBuilder(); | |
394 |
1
1. handleNotFullyReadAttributeKey : negated conditional → KILLED |
} else if (currentChar == '/') { |
395 | // a forward-slash cannot be in the attribute key | |
396 | state.isReadingAttributeKey = false; | |
397 | state.isHalfClosedTag = true; | |
398 | } else { | |
399 | // otherwise keep on reading | |
400 | state.stringBuilder.append(currentChar); | |
401 | // and note we are reading the key | |
402 | state.isReadingAttributeKey = true; | |
403 | } | |
404 | } | |
405 | ||
406 | private static void handleReadingTagName(State state, char currentChar) { | |
407 |
1
1. handleReadingTagName : negated conditional → KILLED |
if (Character.isWhitespace(currentChar)) { |
408 | /* | |
409 | At this point, we've been reading the tag name, and we've encountered whitespace. | |
410 | That means we are done reading the tag name | |
411 | */ | |
412 | state.hasEncounteredTagName = true; | |
413 | state.isReadingTagName = false; | |
414 | state.tagName = state.stringBuilder.toString(); | |
415 | state.attributes = new HashMap<>(); | |
416 | state.stringBuilder = new StringBuilder(); | |
417 | } else { | |
418 | /* | |
419 | Reading the characters of the tag name | |
420 | */ | |
421 | state.hasEncounteredTagName = true; | |
422 | state.tagName = ""; | |
423 | state.stringBuilder.append(currentChar); | |
424 | } | |
425 | } | |
426 | ||
427 | /** | |
428 | * We're just past a starting angle bracket, so we're | |
429 | * feeling our way around what this element is. | |
430 | */ | |
431 | private static void handleBeforeReadingTagName(State state, char currentChar) { | |
432 |
1
1. handleBeforeReadingTagName : negated conditional → KILLED |
if (currentChar == ' ') { |
433 | /* | |
434 | At this point, we're inside the tag, and we've encountered whitespace. | |
435 | Seeking the tag name (although we may be inside a closing tag). | |
436 | */ | |
437 | state.stringBuilder = new StringBuilder(); | |
438 |
1
1. handleBeforeReadingTagName : negated conditional → KILLED |
} else if (currentChar == '/') { |
439 | /* | |
440 | hitting a forward-slash symbol means we're looking | |
441 | at the closure of a tag | |
442 | */ | |
443 | state.isStartTag = false; | |
444 | state.stringBuilder = new StringBuilder(); | |
445 |
1
1. handleBeforeReadingTagName : negated conditional → KILLED |
} else if (Character.isAlphabetic(currentChar)) { |
446 | ||
447 | /* | |
448 | Here, our input could definitely be the letters of a tag name | |
449 | */ | |
450 | state.hasEncounteredTagName = true; | |
451 | state.isReadingTagName = true; | |
452 | state.stringBuilder.append(currentChar); | |
453 | } | |
454 | } | |
455 | ||
456 | /** | |
457 | * This examines the results of reading a tag - if it's | |
458 | * a start tag, it pushes it onto a stack for later | |
459 | * comparison to the end tag. The stack is a key | |
460 | * component of how we are able to nest the tags properly. | |
461 | */ | |
462 | private void processTag(State state, List<HtmlParseNode> nodes) { | |
463 | String tagNameString = state.tagName; | |
464 | TagName tagName; | |
465 | ||
466 | tagName = TagName.findMatchingTagname(tagNameString); | |
467 |
1
1. processTag : negated conditional → KILLED |
if (tagName.equals(TagName.UNRECOGNIZED)) return; |
468 | var tagInfo = new TagInfo(tagName, state.attributes); | |
469 |
1
1. processTag : negated conditional → KILLED |
if (state.isStartTag) { |
470 | HtmlParseNode newNode = new HtmlParseNode(ParseNodeType.ELEMENT, tagInfo, new ArrayList<>(), ""); | |
471 | ||
472 |
1
1. processTag : negated conditional → KILLED |
if (! state.parseStack.isEmpty()) { |
473 | // if we're inside an html element, | |
474 | // add this to the inner content | |
475 |
1
1. processTag : removed call to com/renomad/minum/htmlparsing/HtmlParseNode::addToInnerContent → KILLED |
state.parseStack.peek().addToInnerContent(newNode); |
476 | } | |
477 | ||
478 |
2
1. processTag : negated conditional → KILLED 2. processTag : negated conditional → KILLED |
if (state.parseStack.isEmpty() && tagName.isVoidElement) { |
479 | // if we're at the root level and encountering a void element, | |
480 | // add it to the root-level list of nodes | |
481 | nodes.add(newNode); | |
482 |
1
1. processTag : negated conditional → KILLED |
} else if (!tagName.isVoidElement) { |
483 |
1
1. processTag : removed call to java/util/Deque::push → KILLED |
state.parseStack.push(newNode); |
484 | } | |
485 | ||
486 |
1
1. processTag : negated conditional → KILLED |
if (tagName.equals(TagName.SCRIPT)) { |
487 | state.isInsideScript = true; | |
488 | state.stringBuilder = new StringBuilder(); | |
489 | } | |
490 | } else { | |
491 | // if we're leaving an end-tag, it means we have a | |
492 | // full element with potentially inner content | |
493 | HtmlParseNode htmlParseNode; | |
494 | try { | |
495 | htmlParseNode = state.parseStack.pop(); | |
496 | } catch (NoSuchElementException ex) { | |
497 | throw new ParsingException("No starting tag found. At line " + state.lineRow + " and at the " + state.lineColumn + "th character. " + state.charsRead + " characters read in total."); | |
498 | } | |
499 | ||
500 | /* | |
501 | If the stack is a size of zero at this point, it means we're at the | |
502 | roots of our HTML code, which means it's the proper time to add the | |
503 | topmost element we just popped into a list. | |
504 | */ | |
505 |
1
1. processTag : negated conditional → KILLED |
if (state.parseStack.isEmpty()) { |
506 | nodes.add(htmlParseNode); | |
507 | } | |
508 | TagName expectedTagName = htmlParseNode.getTagInfo().getTagName(); | |
509 |
1
1. processTag : negated conditional → KILLED |
if (expectedTagName != tagName) { |
510 | throw new ParsingException("Did not find expected closing-tag type. " + "Expected: " + expectedTagName + " at line " + state.lineRow + " and at the " + state.lineColumn + "th character. " + state.charsRead + " characters read in total."); | |
511 | } | |
512 | } | |
513 | } | |
514 | ||
/**
 * The kind of quote surrounding an attribute value, along
 * with the literal character used for that quote.
 */
enum QuoteType {
    SINGLE_QUOTED('\''), DOUBLE_QUOTED('"'), NONE(Character.MIN_VALUE);

    /** The quote character itself */
    public final char literal;

    QuoteType(char literal) {
        this.literal = literal;
    }

    /**
     * Pick the quote type for a quote character: a single quote gives
     * {@link #SINGLE_QUOTED}, and every other character gives
     * {@link #DOUBLE_QUOTED}.
     */
    public static QuoteType byLiteral(char currentChar) {
        return currentChar == '\'' ? SINGLE_QUOTED : DOUBLE_QUOTED;
    }
}
532 | ||
533 | static class State { | |
534 | ||
535 | static State buildNewState() { | |
536 | RingBuffer<Character> previousCharacters = new RingBuffer<>(12, Character.class); | |
537 | int lineColumn1 = 0; | |
538 | int lineRow1 = 1; | |
539 | boolean isHalfClosedTag1 = false; | |
540 | boolean isInsideAttributeValueQuoted1 = false; | |
541 | boolean isStartTag1 = true; | |
542 | boolean isReadingTagName1 = false; | |
543 | boolean hasEncounteredTagName1 = false; | |
544 | ArrayDeque<HtmlParseNode> parseStack1 = new ArrayDeque<>(); | |
545 | StringBuilder stringBuilder1 = new StringBuilder(); | |
546 | boolean isInsideTag1 = false; | |
547 | int charsRead1 = 0; | |
548 | String tagName1 = ""; | |
549 | String currentAttributeKey1 = ""; | |
550 | HashMap<String, String> attributes1 = new HashMap<>(); | |
551 | boolean isReadingAttributeKey1 = false; | |
552 | boolean isInsideComment1 = false; | |
553 | boolean isInsideScript1 = false; | |
554 |
1
1. buildNewState : replaced return value with null for com/renomad/minum/htmlparsing/HtmlParser$State::buildNewState → KILLED |
return new State(charsRead1, isInsideTag1, stringBuilder1, parseStack1, hasEncounteredTagName1, |
555 | isReadingTagName1, isStartTag1, isInsideAttributeValueQuoted1, | |
556 | tagName1, currentAttributeKey1, attributes1, QuoteType.NONE, isReadingAttributeKey1, | |
557 | isHalfClosedTag1, lineRow1, lineColumn1, previousCharacters, isInsideComment1, isInsideScript1); | |
558 | } | |
559 | ||
560 | /** | |
561 | * If we encounter a forward-slash in a tag, and we're | |
562 | * not in the midst of reading an attribute value, then | |
563 | * we expect the next character to be a greater-than symbol. | |
564 | */ | |
565 | boolean isHalfClosedTag; | |
566 | /** | |
567 | * total number of chars read of this HTML file | |
568 | */ | |
569 | int charsRead; | |
570 | /** | |
571 | * True if we are inside angle brackets (may be a closing tag) | |
572 | */ | |
573 | boolean isInsideTag; | |
574 | /** | |
575 | * Where we build up tokens a character at a time | |
576 | */ | |
577 | StringBuilder stringBuilder; | |
578 | /** | |
579 | * A stack of HtmlParseNodes, used to see how far deep in the tree we are | |
580 | */ | |
581 | final Deque<HtmlParseNode> parseStack; | |
582 | /** | |
583 | * True if we have successfully encountered the first letter of the tag | |
584 | */ | |
585 | boolean hasEncounteredTagName; | |
586 | /** | |
587 | * True if we are in the process of reading the tag (e.g. p, a, h1, etc) | |
588 | */ | |
589 | boolean isReadingTagName; | |
590 | ||
591 | /** | |
592 | * if we determine we are in the midst of reading an attribute key | |
593 | */ | |
594 | boolean isReadingAttributeKey; | |
595 | ||
596 | /** | |
597 | * True if we determine we are probably in the start tag (rather than the closing tag) | |
598 | */ | |
599 | boolean isStartTag; | |
600 | /** | |
601 | * True if we're inside the quoted area inside an attribute value in an element | |
602 | * tag - this could be where we encounter some symbols that may not be allowed elsewhere. | |
603 | */ | |
604 | boolean isInsideAttributeValueQuoted; | |
605 | /** | |
606 | * If we're in a quoted area, it's either single or double-quoted. | |
607 | * These quotes need to be paired properly, so we need to keep track. | |
608 | */ | |
609 | QuoteType quoteType; | |
610 | /** | |
611 | * The string value of the tag name | |
612 | */ | |
613 | String tagName; | |
614 | /** | |
615 | * The attribute key we just read | |
616 | */ | |
617 | String currentAttributeKey; | |
618 | /** | |
619 | * A map of attribute keys to values. In some cases there won't be an equals | |
620 | * sign, meaning the value is null. In other cases there will be an | |
621 | * equals sign but no value, meaning the value is an empty string. | |
622 | */ | |
623 | Map<String, String> attributes; | |
624 | ||
625 | /** | |
626 | * Indicates which line of the input we are on, for debugging | |
627 | */ | |
628 | int lineRow; | |
629 | ||
630 | /** | |
631 | * How far we are from the last newline character, including | |
632 | * all whitespace as well. | |
633 | */ | |
634 | int lineColumn; | |
635 | ||
636 | /** | |
637 | * This is used to check for comments and script tags, like: | |
638 | * {@code <!-- -->} and {@code <script>} | |
639 | */ | |
640 | final RingBuffer<Character> previousCharacters; | |
641 | ||
642 | /** | |
643 | * Indicates whether we are inside a comment | |
644 | */ | |
645 | boolean isInsideComment; | |
646 | ||
647 | boolean isInsideScript; | |
648 | ||
/**
 * Creates the bookkeeping record that lets the parser remember where
 * it is while it examines the HTML one character at a time.
 * <p>
 * Every argument is stored, exactly as given, in the field of the
 * same name. Nothing is copied or validated here.
 * </p>
 */
public State(int charsRead, boolean isInsideTag, StringBuilder stringBuilder,
             Deque<HtmlParseNode> parseStack, boolean hasEncounteredTagName, boolean isReadingTagName,
             boolean isStartTag, boolean isInsideAttributeValueQuoted, String tagName,
             String currentAttributeKey, Map<String, String> attributes, QuoteType quoteType,
             boolean isReadingAttributeKey, boolean isHalfClosedTag, int lineRow, int lineColumn,
             RingBuffer<Character> previousCharacters, boolean isInsideComment, boolean isInsideScript) {

    // Position within the input.
    this.charsRead = charsRead;
    this.lineRow = lineRow;
    this.lineColumn = lineColumn;
    this.previousCharacters = previousCharacters;

    // Token accumulation and the stack of nodes built so far.
    this.stringBuilder = stringBuilder;
    this.parseStack = parseStack;

    // Tag-level flags and the tag name.
    this.isInsideTag = isInsideTag;
    this.isStartTag = isStartTag;
    this.isHalfClosedTag = isHalfClosedTag;
    this.hasEncounteredTagName = hasEncounteredTagName;
    this.isReadingTagName = isReadingTagName;
    this.tagName = tagName;

    // Attribute-level flags and values.
    this.isReadingAttributeKey = isReadingAttributeKey;
    this.currentAttributeKey = currentAttributeKey;
    this.isInsideAttributeValueQuoted = isInsideAttributeValueQuoted;
    this.quoteType = quoteType;
    this.attributes = attributes;

    // Comment and script tracking.
    this.isInsideComment = isInsideComment;
    this.isInsideScript = isInsideScript;
}
680 | } | |
681 | ||
682 | /** | |
683 | * Search the node tree for matching elements. | |
684 | * <p> | |
685 | * If zero nodes are found, returns an empty list. | |
686 | * </p> | |
687 | */ | |
688 | public List<HtmlParseNode> search(List<HtmlParseNode> nodes, TagName tagName, Map<String, String> attributes) { | |
689 | List<HtmlParseNode> foundNodes = new ArrayList<>(); | |
690 | for (var node : nodes) { | |
691 | var result = node.search(tagName, attributes); | |
692 | foundNodes.addAll(result); | |
693 | } | |
694 |
1
1. search : replaced return value with Collections.emptyList for com/renomad/minum/htmlparsing/HtmlParser::search → KILLED |
return foundNodes; |
695 | } | |
696 | ||
697 | } | |
Mutations | ||
61 |
1.1 2.2 |
|
71 |
1.1 2.2 |
|
74 |
1.1 |
|
91 |
1.1 |
|
94 |
1.1 |
|
95 |
1.1 |
|
96 |
1.1 |
|
97 |
1.1 |
|
100 |
1.1 |
|
105 |
1.1 |
|
106 |
1.1 |
|
107 |
1.1 |
|
108 |
1.1 |
|
110 |
1.1 |
|
119 |
1.1 |
|
120 |
1.1 |
|
121 |
1.1 |
|
124 |
1.1 |
|
129 |
1.1 |
|
130 |
1.1 |
|
145 |
1.1 |
|
152 |
1.1 |
|
157 |
1.1 |
|
159 |
1.1 2.2 3.3 |
|
163 |
1.1 |
|
165 |
1.1 |
|
175 |
1.1 |
|
179 |
1.1 2.2 3.3 4.4 |
|
184 |
1.1 |
|
191 |
1.1 |
|
199 |
1.1 |
|
213 |
1.1 |
|
218 |
1.1 2.2 |
|
219 |
1.1 |
|
229 |
1.1 |
|
245 |
1.1 2.2 |
|
247 |
1.1 |
|
248 |
1.1 |
|
249 |
1.1 |
|
250 |
1.1 |
|
251 |
1.1 |
|
252 |
1.1 |
|
259 |
1.1 2.2 3.3 |
|
274 |
1.1 2.2 3.3 |
|
277 |
1.1 2.2 |
|
291 |
1.1 2.2 |
|
292 |
1.1 2.2 |
|
293 |
1.1 |
|
298 |
1.1 |
|
299 |
1.1 |
|
310 |
1.1 |
|
312 |
1.1 |
|
318 |
1.1 |
|
322 |
1.1 |
|
332 |
1.1 2.2 |
|
333 |
1.1 2.2 |
|
338 |
1.1 |
|
341 |
1.1 |
|
354 |
1.1 2.2 |
|
361 |
1.1 2.2 |
|
379 |
1.1 |
|
389 |
1.1 2.2 |
|
394 |
1.1 |
|
407 |
1.1 |
|
432 |
1.1 |
|
438 |
1.1 |
|
445 |
1.1 |
|
467 |
1.1 |
|
469 |
1.1 |
|
472 |
1.1 |
|
475 |
1.1 |
|
478 |
1.1 2.2 |
|
482 |
1.1 |
|
483 |
1.1 |
|
486 |
1.1 |
|
505 |
1.1 |
|
509 |
1.1 |
|
525 |
1.1 |
|
526 |
1.1 |
|
528 |
1.1 |
|
554 |
1.1 |
|
694 |
1.1 |