| 1 | package com.renomad.minum.htmlparsing; | |
| 2 | ||
| 3 | import com.renomad.minum.security.ForbiddenUseException; | |
| 4 | import com.renomad.minum.utils.RingBuffer; | |
| 5 | ||
| 6 | import java.io.ByteArrayInputStream; | |
| 7 | import java.nio.charset.StandardCharsets; | |
| 8 | import java.util.*; | |
| 9 | ||
| 10 | /** | |
| 11 | * Converts HTML strings to object trees. | |
| 12 | * <p> | |
| 13 | * Enables a developer to analyze an HTML document by its | |
| 14 | * structure. | |
| 15 | * </p> | |
| 16 | * <p> | |
| 17 | * Note: HTML parsing is difficult because | |
| 18 | * of its lenient specification. See Postel's Law. | |
| 19 | * </p> | |
| 20 | * <p> | |
| 21 | * For our purposes, it is less important | |
| 22 | * to perfectly meet the criteria of the spec, so | |
| 23 | * there will be numerous edge-cases unaccounted-for | |
| 24 | * by this implementation. Nevertheless, this program | |
| 25 | * should suit many needs for ordinary web applications. | |
| 26 | * </p> | |
| 27 | */ | |
| 28 | public final class HtmlParser { | |
| 29 | ||
| 30 | /** | |
| 31 | * Most total chars we'll read. | |
| 32 | */ | |
| 33 | static final int MAX_HTML_SIZE = 2 * 1024 * 1024; | |
| 34 | ||
| 35 | /** | |
| 36 | * Given any HTML input, scan through and generate a tree | |
| 37 | * of HTML nodes. Return a list of the roots of the tree. | |
| 38 | * <p> | |
| 39 | * This parser operates with a very particular paradigm in mind. I'll explain | |
| 40 | * it through examples. Let's look at some typical HTML: | |
| 41 | * </p> | |
| 42 | * <pre>{@code <p>Hello world</p>}</pre> | |
| 43 | * <p> | |
| 44 | * The way we will model this is as follows: | |
| 45 | * </p> | |
| 46 | * <pre>{@code <ELEMENT_NAME_AND_DETAILS>content<END_OF_ELEMENT>}</pre> | |
| 47 | * <p> | |
| 48 | * We will examine the first part, "ELEMENT_NAME_AND_DETAILS", and | |
| 49 | * grab the element's name and any attributes. Then we will descend into the | |
| 50 | * content section. We know we have hit the end of the element by keeping | |
| 51 | * track of how far we have descended/ascended and whether we are hitting | |
| 52 | * a closing HTML element. | |
| 53 | * </p> | |
| 54 | * <p> | |
| 55 | * Complicating this is that elements may not have content, for example | |
| 56 | * any <a href="https://www.w3.org/TR/2011/WD-html-markup-20110113/syntax.html#void-element_xref3">void elements</a> | |
| 57 | * or when a user chooses to create an empty tag | |
| 58 | * </p> | |
| 59 | */ | |
| 60 | public List<HtmlParseNode> parse(String input) { | |
| 61 |
2
1. parse : changed conditional boundary → KILLED 2. parse : negated conditional → KILLED |
if (input.length() > MAX_HTML_SIZE) |
| 62 | throw new ForbiddenUseException("Input exceeds max allowed HTML text size, " + MAX_HTML_SIZE + " chars"); | |
| 63 | var is = new ByteArrayInputStream(input.getBytes(StandardCharsets.UTF_8)); | |
| 64 | ||
| 65 | List<HtmlParseNode> nodes = new ArrayList<>(); | |
| 66 | State state = State.buildNewState(); | |
| 67 | ||
| 68 | while (true) { | |
| 69 | int value = is.read(); | |
| 70 | // if the value is -1, there's nothing left to read | |
| 71 |
2
1. parse : negated conditional → KILLED 2. parse : replaced return value with Collections.emptyList for com/renomad/minum/htmlparsing/HtmlParser::parse → KILLED |
if (value == -1) return nodes; |
| 72 | ||
| 73 | char currentChar = (char) value; | |
| 74 |
1
1. parse : removed call to com/renomad/minum/htmlparsing/HtmlParser::processState → KILLED |
processState(currentChar, state, nodes); |
| 75 | } | |
| 76 | } | |
| 77 | ||
| 78 | /** | |
| 79 | * Use important symbols in the HTML code to indicate | |
| 80 | * which mode we are in - reading inside a tag, or between | |
| 81 | * tags. | |
| 82 | * <p> | |
| 83 | * Apologies to future readers. Hand-written parser code is the suck. | |
| 84 | * </p> | |
| 85 | * <p> | |
| 86 | * That said, there are plenty of tests exercising this, and it is | |
| 87 | * easy to test due to having been built using TDD. Cold comfort, I know. | |
| 88 | * </p> | |
| 89 | */ | |
| 90 | private void processState(char currentChar, State state, List<HtmlParseNode> nodes) { | |
| 91 |
1
1. processState : removed call to com/renomad/minum/htmlparsing/HtmlParser::recordLocation → KILLED |
recordLocation(currentChar, state); |
| 92 | ||
| 93 | // keep track of previous twelve characters, to check if inside comments and scripts | |
| 94 |
1
1. processState : removed call to com/renomad/minum/utils/RingBuffer::add → KILLED |
state.previousCharacters.add(currentChar); |
| 95 |
1
1. processState : removed call to com/renomad/minum/htmlparsing/HtmlParser::determineCommentState → KILLED |
determineCommentState(state); |
| 96 |
1
1. processState : removed call to com/renomad/minum/htmlparsing/HtmlParser::determineScriptState → KILLED |
determineScriptState(state); |
| 97 |
1
1. processState : negated conditional → KILLED |
if (state.isInsideComment) { |
| 98 | return; | |
| 99 | } | |
| 100 |
1
1. processState : negated conditional → KILLED |
if (state.isInsideScript) { |
| 101 | state.stringBuilder.append(currentChar); | |
| 102 | return; | |
| 103 | } | |
| 104 | ||
| 105 |
1
1. processState : negated conditional → KILLED |
if (currentChar == '<') { |
| 106 |
1
1. processState : removed call to com/renomad/minum/htmlparsing/HtmlParser::processLessThan → KILLED |
processLessThan(currentChar, state); |
| 107 |
1
1. processState : negated conditional → KILLED |
} else if (currentChar == '>') { |
| 108 |
1
1. processState : removed call to com/renomad/minum/htmlparsing/HtmlParser::processGreaterThan → KILLED |
processGreaterThan(currentChar, state, nodes); |
| 109 | } else { | |
| 110 |
1
1. processState : removed call to com/renomad/minum/htmlparsing/HtmlParser::addingToken → KILLED |
addingToken(state, currentChar); |
| 111 | } | |
| 112 | } | |
| 113 | ||
| 114 | /** | |
| 115 | * handle basic recording of stats, like row and column, | |
| 116 | * useful during error messages | |
| 117 | */ | |
| 118 | private static void recordLocation(char currentChar, State state) { | |
| 119 |
1
1. recordLocation : Replaced integer addition with subtraction → KILLED |
state.charsRead += 1; |
| 120 |
1
1. recordLocation : negated conditional → KILLED |
if (currentChar == '\n') { |
| 121 |
1
1. recordLocation : Replaced integer addition with subtraction → KILLED |
state.lineRow += 1; |
| 122 | state.lineColumn = 0; | |
| 123 | } | |
| 124 |
1
1. recordLocation : Replaced integer addition with subtraction → KILLED |
state.lineColumn += 1; |
| 125 | } | |
| 126 | ||
| 127 | private void processGreaterThan(char currentChar, State state, List<HtmlParseNode> nodes) { | |
| 128 | /* It's allowed to use greater-than signs in a lot of places */ | |
| 129 |
1
1. processGreaterThan : negated conditional → KILLED |
if (state.isInsideTag) { |
| 130 |
1
1. processGreaterThan : removed call to com/renomad/minum/htmlparsing/HtmlParser::handleExitingTag → KILLED |
handleExitingTag(currentChar, state, nodes); |
| 131 | } else { | |
| 132 | /* | |
| 133 | This situation means we're looking at a | |
| 134 | free-floating greater-than symbol in | |
| 135 | the html text. | |
| 136 | */ | |
| 137 | state.stringBuilder.append(currentChar); | |
| 138 | } | |
| 139 | } | |
| 140 | ||
| 141 | /** | |
| 142 | * As we leave the tag, we make some decisions about it. | |
| 143 | */ | |
| 144 | private void handleExitingTag(char currentChar, State state, List<HtmlParseNode> nodes) { | |
| 145 |
1
1. handleExitingTag : negated conditional → KILLED |
if (state.isInsideAttributeValueQuoted) { |
| 146 | /* | |
| 147 | Here, we're looking at a greater-than | |
| 148 | that is inside a quoted attribute value | |
| 149 | */ | |
| 150 | state.stringBuilder.append(currentChar); | |
| 151 | } else { | |
| 152 |
1
1. handleExitingTag : removed call to com/renomad/minum/htmlparsing/HtmlParser::handleTagComponents → KILLED |
handleTagComponents(state, nodes); |
| 153 | } | |
| 154 | } | |
| 155 | ||
| 156 | private void handleTagComponents(State state, List<HtmlParseNode> nodes) { | |
| 157 |
1
1. handleTagComponents : negated conditional → KILLED |
if (hasFinishedBuildingTagname(state.hasEncounteredTagName, state.tagName, state.stringBuilder)) { |
| 158 | state.tagName = state.stringBuilder.toString(); | |
| 159 |
3
1. handleTagComponents : negated conditional → KILLED 2. handleTagComponents : negated conditional → KILLED 3. handleTagComponents : negated conditional → KILLED |
} else if (!state.stringBuilder.isEmpty() && state.currentAttributeKey.isBlank() && state.isReadingAttributeKey) { |
| 160 | state.attributes.put(state.stringBuilder.toString(), ""); | |
| 161 | state.stringBuilder = new StringBuilder(); | |
| 162 | state.isReadingAttributeKey = false; | |
| 163 |
1
1. handleTagComponents : negated conditional → KILLED |
} else if (!state.currentAttributeKey.isBlank()) { |
| 164 | // if we were in the midst of reading attribute stuff when we hit the closing bracket... | |
| 165 |
1
1. handleTagComponents : negated conditional → KILLED |
if (!state.stringBuilder.isEmpty()) { |
| 166 | state.attributes.put(state.currentAttributeKey, state.stringBuilder.toString()); | |
| 167 | } else { | |
| 168 | state.attributes.put(state.currentAttributeKey, ""); | |
| 169 | } | |
| 170 | state.isInsideAttributeValueQuoted = false; | |
| 171 | state.stringBuilder = new StringBuilder(); | |
| 172 | state.currentAttributeKey = ""; | |
| 173 | } | |
| 174 | ||
| 175 |
1
1. handleTagComponents : removed call to com/renomad/minum/htmlparsing/HtmlParser::processTagAndResetState → KILLED |
processTagAndResetState(state, nodes); |
| 176 | } | |
| 177 | ||
| 178 | static boolean hasFinishedBuildingTagname(boolean hasEncounteredTagName, String tagName, StringBuilder sb) { | |
| 179 |
4
1. hasFinishedBuildingTagname : negated conditional → KILLED 2. hasFinishedBuildingTagname : negated conditional → KILLED 3. hasFinishedBuildingTagname : replaced boolean return with true for com/renomad/minum/htmlparsing/HtmlParser::hasFinishedBuildingTagname → KILLED 4. hasFinishedBuildingTagname : negated conditional → KILLED |
return hasEncounteredTagName && tagName.isEmpty() && !sb.isEmpty(); |
| 180 | } | |
| 181 | ||
| 182 | private void processLessThan(char currentChar, State state) { | |
| 183 | /* less-than signs are policed strictly */ | |
| 184 |
1
1. processLessThan : negated conditional → KILLED |
if (state.isInsideAttributeValueQuoted) { |
| 185 | /* | |
| 186 | Here, we're looking at a less-than that | |
| 187 | is inside a quoted attribute value | |
| 188 | */ | |
| 189 | state.stringBuilder.append(currentChar); | |
| 190 | } else { | |
| 191 |
1
1. processLessThan : removed call to com/renomad/minum/htmlparsing/HtmlParser::enteringTag → KILLED |
enteringTag(state); |
| 192 | } | |
| 193 | } | |
| 194 | ||
| 195 | /** | |
| 196 | * When we've read a less-than sign and are entering an HTML tag. | |
| 197 | */ | |
| 198 | private void enteringTag(State state) { | |
| 199 |
1
1. enteringTag : removed call to com/renomad/minum/htmlparsing/HtmlParser::addText → KILLED |
addText(state); |
| 200 | ||
| 201 | state.isInsideTag = true; | |
| 202 | /* | |
| 203 | not really sure it's a start tag, but if we | |
| 204 | assume it is that's fine, because if we hit | |
| 205 | a forward slash at the beginning, it becomes | |
| 206 | a non-start-tag. | |
| 207 | */ | |
| 208 | state.isStartTag = true; | |
| 209 | state.stringBuilder = new StringBuilder(); | |
| 210 | } | |
| 211 | ||
| 212 | private static void addText(State state) { | |
| 213 |
1
1. addText : negated conditional → KILLED |
if (!state.stringBuilder.isEmpty()) { |
| 214 | ||
| 215 | String textContent = state.stringBuilder.toString(); | |
| 216 | ||
| 217 | // This is where we add characters if we found any between tags. | |
| 218 |
2
1. addText : negated conditional → KILLED 2. addText : negated conditional → KILLED |
if (! state.parseStack.isEmpty() && ! textContent.isBlank()) { |
| 219 |
1
1. addText : removed call to com/renomad/minum/htmlparsing/HtmlParseNode::addToInnerContent → KILLED |
state.parseStack.peek().addToInnerContent(new HtmlParseNode(ParseNodeType.CHARACTERS, TagInfo.EMPTY, new ArrayList<>(), textContent)); |
| 220 | } | |
| 221 | } | |
| 222 | } | |
| 223 | ||
| 224 | /** | |
| 225 | * Called when we've just hit a greater-than sign and thus | |
| 226 | * exited an HTML tag. | |
| 227 | */ | |
| 228 | private void processTagAndResetState(State state, List<HtmlParseNode> nodes) { | |
| 229 |
1
1. processTagAndResetState : removed call to com/renomad/minum/htmlparsing/HtmlParser::processTag → KILLED |
processTag(state, nodes); |
| 230 | ||
| 231 | state.isHalfClosedTag = false; | |
| 232 | state.isInsideTag = false; | |
| 233 | state.isStartTag = false; | |
| 234 | state.isReadingTagName = false; | |
| 235 | state.tagName = ""; | |
| 236 | state.attributes = new HashMap<>(); | |
| 237 | state.hasEncounteredTagName = false; | |
| 238 | state.stringBuilder = new StringBuilder(); | |
| 239 | } | |
| 240 | ||
| 241 | /** | |
| 242 | * The commonest case when reading characters. Buckle up. | |
| 243 | */ | |
| 244 | private void addingToken(State state, char currentChar) { | |
| 245 |
2
1. addingToken : negated conditional → KILLED 2. addingToken : negated conditional → KILLED |
var hasNotBegunReadingTagName = state.isInsideTag && !state.hasEncounteredTagName; |
| 246 | ||
| 247 |
1
1. addingToken : negated conditional → KILLED |
if (hasNotBegunReadingTagName) { |
| 248 |
1
1. addingToken : removed call to com/renomad/minum/htmlparsing/HtmlParser::handleBeforeReadingTagName → KILLED |
handleBeforeReadingTagName(state, currentChar); |
| 249 |
1
1. addingToken : negated conditional → KILLED |
} else if (state.isReadingTagName) { |
| 250 |
1
1. addingToken : removed call to com/renomad/minum/htmlparsing/HtmlParser::handleReadingTagName → KILLED |
handleReadingTagName(state, currentChar); |
| 251 |
1
1. addingToken : negated conditional → KILLED |
} else if (isFinishedReadingTag(state.tagName, state.isInsideTag)) { |
| 252 |
1
1. addingToken : removed call to com/renomad/minum/htmlparsing/HtmlParser::handleAfterReadingTagName → KILLED |
handleAfterReadingTagName(state, currentChar); |
| 253 | } else { | |
| 254 | state.stringBuilder.append(currentChar); | |
| 255 | } | |
| 256 | } | |
| 257 | ||
| 258 | static boolean isFinishedReadingTag(String tagName, boolean isInsideTag) { | |
| 259 |
3
1. isFinishedReadingTag : negated conditional → KILLED 2. isFinishedReadingTag : replaced boolean return with true for com/renomad/minum/htmlparsing/HtmlParser::isFinishedReadingTag → KILLED 3. isFinishedReadingTag : negated conditional → KILLED |
return !tagName.isEmpty() && isInsideTag; |
| 260 | } | |
| 261 | ||
| 262 | static final List<Character> startOfComment = List.of('<', '!', '-', '-'); | |
| 263 | static final List<Character> endOfComment = List.of('-', '-', '>'); | |
| 264 | ||
| 265 | /** | |
| 266 | * Returns whether we are inside an HTML comment, | |
| 267 | * that is {@code <!-- -->} | |
| 268 | */ | |
| 269 | private void determineCommentState(State state) { | |
| 270 | boolean atCommentStart = state.previousCharacters.containsAt(startOfComment, 8); | |
| 271 | boolean atCommentEnd = state.previousCharacters.containsAt(endOfComment, 8); | |
| 272 | boolean isInsideTag = state.isInsideTag; | |
| 273 | boolean hasEncounteredTagName = state.hasEncounteredTagName; | |
| 274 |
3
1. determineCommentState : negated conditional → KILLED 2. determineCommentState : negated conditional → KILLED 3. determineCommentState : negated conditional → KILLED |
if (isInsideTag && !hasEncounteredTagName && atCommentStart) { |
| 275 | state.isInsideComment = true; | |
| 276 | state.isInsideTag = false; | |
| 277 |
2
1. determineCommentState : negated conditional → KILLED 2. determineCommentState : negated conditional → KILLED |
} else if (state.isInsideComment && atCommentEnd) { |
| 278 | state.isInsideComment = false; | |
| 279 | } | |
| 280 | } | |
| 281 | ||
| 282 | static final List<Character> scriptElement = List.of('<','/','s','c','r','i','p','t','>'); | |
| 283 | ||
| 284 | /** | |
| 285 | * Determines whether we have hit the end of the script block | |
| 286 | * by looking for the closing script tag. | |
| 287 | */ | |
| 288 | private void determineScriptState(State state) { | |
| 289 | boolean isScriptFinished = state.previousCharacters.containsAt(scriptElement, 3); | |
| 290 | boolean wasInsideScript = state.isInsideScript; | |
| 291 |
2
1. determineScriptState : negated conditional → KILLED 2. determineScriptState : negated conditional → KILLED |
state.isInsideScript = state.isInsideScript && !isScriptFinished; |
| 292 |
2
1. determineScriptState : negated conditional → KILLED 2. determineScriptState : negated conditional → KILLED |
boolean justClosedScriptTag = wasInsideScript && !state.isInsideScript; |
| 293 |
1
1. determineScriptState : negated conditional → KILLED |
if (justClosedScriptTag) { |
| 294 | state.tagName = "script"; | |
| 295 | state.isInsideTag = true; | |
| 296 | state.isStartTag = false; | |
| 297 | var innerTextLength = state.stringBuilder.length(); | |
| 298 |
1
1. determineScriptState : Replaced integer subtraction with addition → KILLED |
state.stringBuilder.delete(innerTextLength - 8, innerTextLength); |
| 299 |
1
1. determineScriptState : removed call to com/renomad/minum/htmlparsing/HtmlParser::addText → KILLED |
addText(state); |
| 300 | } | |
| 301 | ||
| 302 | } | |
| 303 | ||
| 304 | /** | |
| 305 | * at this point we have a tagname for our tag, and we're still in the tag | |
| 306 | */ | |
| 307 | private static void handleAfterReadingTagName(State state, char currentChar) { | |
| 308 | ||
| 309 | boolean isHandlingAttributes = isHandlingAttributes(state, currentChar); | |
| 310 |
1
1. handleAfterReadingTagName : negated conditional → KILLED |
if (isHandlingAttributes) { |
| 311 | ||
| 312 |
1
1. handleAfterReadingTagName : negated conditional → KILLED |
if (state.currentAttributeKey.isBlank()) { |
| 313 | /* | |
| 314 | because the key is blank, we know we haven't read it all. That's | |
| 315 | because when we finish reading the key, we'll add it to currentAttributeKey | |
| 316 | and be in the mode of reading the value. | |
| 317 | */ | |
| 318 |
1
1. handleAfterReadingTagName : removed call to com/renomad/minum/htmlparsing/HtmlParser::handleNotFullyReadAttributeKey → KILLED |
handleNotFullyReadAttributeKey(state, currentChar); |
| 319 | } else { | |
| 320 | // reading in the (potential) attribute value | |
| 321 | ||
| 322 |
1
1. handleAfterReadingTagName : removed call to com/renomad/minum/htmlparsing/HtmlParser::handlePotentialAttributeValue → KILLED |
handlePotentialAttributeValue(state, currentChar); |
| 323 | } | |
| 324 | } | |
| 325 | } | |
| 326 | ||
| 327 | /** | |
| 328 | * Check whether we're past the whitespace between the tag name and | |
| 329 | * the start of the (potential) attribute key. | |
| 330 | */ | |
| 331 | static boolean isHandlingAttributes(State state, char currentChar) { | |
| 332 |
2
1. isHandlingAttributes : replaced boolean return with true for com/renomad/minum/htmlparsing/HtmlParser::isHandlingAttributes → KILLED 2. isHandlingAttributes : negated conditional → KILLED |
return ! (state.currentAttributeKey.isEmpty() && |
| 333 |
2
1. isHandlingAttributes : negated conditional → KILLED 2. isHandlingAttributes : negated conditional → KILLED |
state.stringBuilder.isEmpty() |
| 334 | && currentChar == ' '); | |
| 335 | } | |
| 336 | ||
| 337 | private static void handlePotentialAttributeValue(State state, char currentChar) { | |
| 338 |
1
1. handlePotentialAttributeValue : negated conditional → KILLED |
if (state.isInsideAttributeValueQuoted) { |
| 339 | // if we're already inside a quoted area, encountering a | |
| 340 | // closing quote will take us out of it. | |
| 341 |
1
1. handlePotentialAttributeValue : negated conditional → KILLED |
if (currentChar == state.quoteType.literal) { |
| 342 | // if we hit the matching end-quote, switch modes | |
| 343 | state.isInsideAttributeValueQuoted = false; | |
| 344 | state.quoteType = QuoteType.NONE; | |
| 345 | state.attributes.put(state.currentAttributeKey, state.stringBuilder.toString()); | |
| 346 | state.stringBuilder = new StringBuilder(); | |
| 347 | state.currentAttributeKey = ""; | |
| 348 | state.isReadingAttributeKey = false; | |
| 349 | } else { | |
| 350 | // otherwise keep on trucking, adding characters | |
| 351 | state.stringBuilder.append(currentChar); | |
| 352 | } | |
| 353 | } else { | |
| 354 |
2
1. handlePotentialAttributeValue : negated conditional → KILLED 2. handlePotentialAttributeValue : negated conditional → KILLED |
if (currentChar == '"' || currentChar == '\'') { |
| 355 | /* | |
| 356 | if we're not currently inside a quoted area but encounter | |
| 357 | a quote, switch modes. | |
| 358 | */ | |
| 359 | state.isInsideAttributeValueQuoted = true; | |
| 360 | state.quoteType = QuoteType.byLiteral(currentChar); | |
| 361 |
2
1. handlePotentialAttributeValue : negated conditional → KILLED 2. handlePotentialAttributeValue : negated conditional → KILLED |
} else if (!state.stringBuilder.isEmpty() && currentChar == ' ') { |
| 362 | /* | |
| 363 | if we're not in a quoted area and encounter a space, then | |
| 364 | we're done reading the attribute value and can add the key-value | |
| 365 | pair to the map. | |
| 366 | */ | |
| 367 | state.attributes.put(state.currentAttributeKey, state.stringBuilder.toString()); | |
| 368 | state.isReadingAttributeKey = false; | |
| 369 | state.stringBuilder = new StringBuilder(); | |
| 370 | state.currentAttributeKey = ""; | |
| 371 | } else { | |
| 372 | // otherwise keep trucking along adding characters | |
| 373 | state.stringBuilder.append(currentChar); | |
| 374 | } | |
| 375 | } | |
| 376 | } | |
| 377 | ||
| 378 | private static void handleNotFullyReadAttributeKey(State state, char currentChar) { | |
| 379 |
1
1. handleNotFullyReadAttributeKey : negated conditional → KILLED |
if (state.isHalfClosedTag) { |
| 380 | /* | |
| 381 | This situation occurs when we are in a void tag, like <link />, | |
| 382 | and are closing the tag with a forward slash + closing bracket. | |
| 383 | ||
| 384 | if we got here, it means the previous char was | |
| 385 | a forward slash, so the current character *should* | |
| 386 | be a closing angle, but if it's not ... | |
| 387 | */ | |
| 388 | throw new ParsingException(String.format("in closing a void tag (e.g. <link />), character after forward slash must be angle bracket. Char: %s at line %d and at the %d character. %d chars read in total.", currentChar, state.lineRow, state.lineColumn, state.charsRead)); | |
| 389 |
2
1. handleNotFullyReadAttributeKey : negated conditional → KILLED 2. handleNotFullyReadAttributeKey : negated conditional → KILLED |
} else if (currentChar == ' ' || currentChar == '=') { |
| 390 | // if we hit whitespace or an equals sign, we're done reading the key | |
| 391 | state.currentAttributeKey = state.stringBuilder.toString(); | |
| 392 | state.isReadingAttributeKey = false; | |
| 393 | state.stringBuilder = new StringBuilder(); | |
| 394 |
1
1. handleNotFullyReadAttributeKey : negated conditional → KILLED |
} else if (currentChar == '/') { |
| 395 | // a forward-slash cannot be in the attribute key | |
| 396 | state.isReadingAttributeKey = false; | |
| 397 | state.isHalfClosedTag = true; | |
| 398 | } else { | |
| 399 | // otherwise keep on reading | |
| 400 | state.stringBuilder.append(currentChar); | |
| 401 | // and note we are reading the key | |
| 402 | state.isReadingAttributeKey = true; | |
| 403 | } | |
| 404 | } | |
| 405 | ||
| 406 | private static void handleReadingTagName(State state, char currentChar) { | |
| 407 |
1
1. handleReadingTagName : negated conditional → KILLED |
if (Character.isWhitespace(currentChar)) { |
| 408 | /* | |
| 409 | At this point, we've been reading the tag name, and we've encountered whitespace. | |
| 410 | That means we are done reading the tag name | |
| 411 | */ | |
| 412 | state.hasEncounteredTagName = true; | |
| 413 | state.isReadingTagName = false; | |
| 414 | state.tagName = state.stringBuilder.toString(); | |
| 415 | state.attributes = new HashMap<>(); | |
| 416 | state.stringBuilder = new StringBuilder(); | |
| 417 | } else { | |
| 418 | /* | |
| 419 | Reading the characters of the tag name | |
| 420 | */ | |
| 421 | state.hasEncounteredTagName = true; | |
| 422 | state.tagName = ""; | |
| 423 | state.stringBuilder.append(currentChar); | |
| 424 | } | |
| 425 | } | |
| 426 | ||
| 427 | /** | |
| 428 | * We're just past a starting angle bracket, so we're | |
| 429 | * feeling our way around what this element is. | |
| 430 | */ | |
| 431 | private static void handleBeforeReadingTagName(State state, char currentChar) { | |
| 432 |
1
1. handleBeforeReadingTagName : negated conditional → KILLED |
if (currentChar == ' ') { |
| 433 | /* | |
| 434 | At this point, we're inside the tag, and we've encountered whitespace. | |
| 435 | Seeking the tag name (although we may be inside a closing tag). | |
| 436 | */ | |
| 437 | state.stringBuilder = new StringBuilder(); | |
| 438 |
1
1. handleBeforeReadingTagName : negated conditional → KILLED |
} else if (currentChar == '/') { |
| 439 | /* | |
| 440 | hitting a forward-slash symbol means we're looking | |
| 441 | at the closure of a tag | |
| 442 | */ | |
| 443 | state.isStartTag = false; | |
| 444 | state.stringBuilder = new StringBuilder(); | |
| 445 |
1
1. handleBeforeReadingTagName : negated conditional → KILLED |
} else if (Character.isAlphabetic(currentChar)) { |
| 446 | ||
| 447 | /* | |
| 448 | Here, our input could definitely be the letters of a tag name | |
| 449 | */ | |
| 450 | state.hasEncounteredTagName = true; | |
| 451 | state.isReadingTagName = true; | |
| 452 | state.stringBuilder.append(currentChar); | |
| 453 | } | |
| 454 | } | |
| 455 | ||
| 456 | /** | |
| 457 | * This examines the results of reading a tag - if it's | |
| 458 | * a start tag, it pushes it onto a stack for later | |
| 459 | * comparison to the end tag. The stack is a key | |
| 460 | * component of how we are able to nest the tags properly. | |
| 461 | */ | |
| 462 | private void processTag(State state, List<HtmlParseNode> nodes) { | |
| 463 | String tagNameString = state.tagName; | |
| 464 | TagName tagName; | |
| 465 | ||
| 466 | tagName = TagName.findMatchingTagname(tagNameString); | |
| 467 |
1
1. processTag : negated conditional → KILLED |
if (tagName.equals(TagName.UNRECOGNIZED)) return; |
| 468 | var tagInfo = new TagInfo(tagName, state.attributes); | |
| 469 |
1
1. processTag : negated conditional → KILLED |
if (state.isStartTag) { |
| 470 | HtmlParseNode newNode = new HtmlParseNode(ParseNodeType.ELEMENT, tagInfo, new ArrayList<>(), ""); | |
| 471 | ||
| 472 |
1
1. processTag : negated conditional → KILLED |
if (! state.parseStack.isEmpty()) { |
| 473 | // if we're inside an html element, | |
| 474 | // add this to the inner content | |
| 475 |
1
1. processTag : removed call to com/renomad/minum/htmlparsing/HtmlParseNode::addToInnerContent → KILLED |
state.parseStack.peek().addToInnerContent(newNode); |
| 476 | } | |
| 477 | ||
| 478 |
2
1. processTag : negated conditional → KILLED 2. processTag : negated conditional → KILLED |
if (state.parseStack.isEmpty() && tagName.isVoidElement) { |
| 479 | // if we're at the root level and encountering a void element, | |
| 480 | // add it to the root-level list of nodes | |
| 481 | nodes.add(newNode); | |
| 482 |
1
1. processTag : negated conditional → KILLED |
} else if (!tagName.isVoidElement) { |
| 483 |
1
1. processTag : removed call to java/util/Deque::push → KILLED |
state.parseStack.push(newNode); |
| 484 | } | |
| 485 | ||
| 486 |
1
1. processTag : negated conditional → KILLED |
if (tagName.equals(TagName.SCRIPT)) { |
| 487 | state.isInsideScript = true; | |
| 488 | state.stringBuilder = new StringBuilder(); | |
| 489 | } | |
| 490 | } else { | |
| 491 | // if we're leaving an end-tag, it means we have a | |
| 492 | // full element with potentially inner content | |
| 493 | HtmlParseNode htmlParseNode; | |
| 494 | try { | |
| 495 | htmlParseNode = state.parseStack.pop(); | |
| 496 | } catch (NoSuchElementException ex) { | |
| 497 | throw new ParsingException("No starting tag found. At line " + state.lineRow + " and at the " + state.lineColumn + "th character. " + state.charsRead + " characters read in total."); | |
| 498 | } | |
| 499 | ||
| 500 | /* | |
| 501 | If the stack is a size of zero at this point, it means we're at the | |
| 502 | roots of our HTML code, which means it's the proper time to add the | |
| 503 | topmost element we just popped into a list. | |
| 504 | */ | |
| 505 |
1
1. processTag : negated conditional → KILLED |
if (state.parseStack.isEmpty()) { |
| 506 | nodes.add(htmlParseNode); | |
| 507 | } | |
| 508 | TagName expectedTagName = htmlParseNode.getTagInfo().getTagName(); | |
| 509 |
1
1. processTag : negated conditional → KILLED |
if (expectedTagName != tagName) { |
| 510 | throw new ParsingException("Did not find expected closing-tag type. " + "Expected: " + expectedTagName + " at line " + state.lineRow + " and at the " + state.lineColumn + "th character. " + state.charsRead + " characters read in total."); | |
| 511 | } | |
| 512 | } | |
| 513 | } | |
| 514 | ||
/**
 * The kind of quote surrounding an attribute value, together
 * with the character that delimits it.
 */
enum QuoteType {
    SINGLE_QUOTED('\''),
    DOUBLE_QUOTED('"'),
    NONE(Character.MIN_VALUE);

    /** The delimiting character for this kind of quote. */
    public final char literal;

    QuoteType(char literal) {
        this.literal = literal;
    }

    /**
     * Maps a quote character to its type: a single quote gives
     * {@link #SINGLE_QUOTED}, every other character gives {@link #DOUBLE_QUOTED}.
     */
    public static QuoteType byLiteral(char currentChar) {
        return currentChar == '\'' ? SINGLE_QUOTED : DOUBLE_QUOTED;
    }
}
| 532 | ||
| 533 | /** Mutable record of where the parser is as it examines the HTML a character at a time. */ static class State { | |
| 534 | ||
| 535 | /** Builds a State with starting values: row 1, column 0, start-tag mode on, every other flag false, and empty buffers and collections. */ static State buildNewState() { | |
| 536 | RingBuffer<Character> previousCharacters = new RingBuffer<>(12, Character.class); | |
| 537 | int lineColumn1 = 0; | |
| 538 | int lineRow1 = 1; | |
| 539 | boolean isHalfClosedTag1 = false; | |
| 540 | boolean isInsideAttributeValueQuoted1 = false; | |
| 541 | boolean isStartTag1 = true; | |
| 542 | boolean isReadingTagName1 = false; | |
| 543 | boolean hasEncounteredTagName1 = false; | |
| 544 | ArrayDeque<HtmlParseNode> parseStack1 = new ArrayDeque<>(); | |
| 545 | StringBuilder stringBuilder1 = new StringBuilder(); | |
| 546 | boolean isInsideTag1 = false; | |
| 547 | int charsRead1 = 0; | |
| 548 | String tagName1 = ""; | |
| 549 | String currentAttributeKey1 = ""; | |
| 550 | HashMap<String, String> attributes1 = new HashMap<>(); | |
| 551 | boolean isReadingAttributeKey1 = false; | |
| 552 | boolean isInsideComment1 = false; | |
| 553 | boolean isInsideScript1 = false; | |
| 554 |
1
1. buildNewState : replaced return value with null for com/renomad/minum/htmlparsing/HtmlParser$State::buildNewState → KILLED |
return new State(charsRead1, isInsideTag1, stringBuilder1, parseStack1, hasEncounteredTagName1, |
| 555 | isReadingTagName1, isStartTag1, isInsideAttributeValueQuoted1, | |
| 556 | tagName1, currentAttributeKey1, attributes1, QuoteType.NONE, isReadingAttributeKey1, | |
| 557 | isHalfClosedTag1, lineRow1, lineColumn1, previousCharacters, isInsideComment1, isInsideScript1); | |
| 558 | } | |
| 559 | ||
| 560 | /** | |
| 561 | * If we encounter a forward-slash in a tag, and we're | |
| 562 | * not in the midst of reading an attribute value, then | |
| 563 | * we expect the next character to be a greater-than symbol. | |
| 564 | */ | |
| 565 | boolean isHalfClosedTag; | |
| 566 | /** | |
| 567 | * Total number of chars read of this HTML file | |
| 568 | */ | |
| 569 | int charsRead; | |
| 570 | /** | |
| 571 | * True if we are inside angle brackets (may be a closing tag) | |
| 572 | */ | |
| 573 | boolean isInsideTag; | |
| 574 | /** | |
| 575 | * Where we build up tokens a character at a time | |
| 576 | */ | |
| 577 | StringBuilder stringBuilder; | |
| 578 | /** | |
| 579 | * A stack of HtmlParseNodes, used to see how far deep in the tree we are | |
| 580 | */ | |
| 581 | final Deque<HtmlParseNode> parseStack; | |
| 582 | /** | |
| 583 | * True if we have successfully encountered the first letter of the tag | |
| 584 | */ | |
| 585 | boolean hasEncounteredTagName; | |
| 586 | /** | |
| 587 | * True if we are in the process of reading the tag (e.g. p, a, h1, etc) | |
| 588 | */ | |
| 589 | boolean isReadingTagName; | |
| 590 | ||
| 591 | /** | |
| 592 | * True if we determine we are in the midst of reading an attribute key | |
| 593 | */ | |
| 594 | boolean isReadingAttributeKey; | |
| 595 | ||
| 596 | /** | |
| 597 | * True if we determine we are probably in the start tag (rather than the closing tag) | |
| 598 | */ | |
| 599 | boolean isStartTag; | |
| 600 | /** | |
| 601 | * True if we're inside the quoted area inside an attribute value in an element | |
| 602 | * tag - this could be where we encounter some symbols that may not be allowed elsewhere. | |
| 603 | */ | |
| 604 | boolean isInsideAttributeValueQuoted; | |
| 605 | /** | |
| 606 | * If we're in a quoted area, it's either single or double-quoted. | |
| 607 | * These quotes need to be paired properly, so we need to keep track. | |
| 608 | */ | |
| 609 | QuoteType quoteType; | |
| 610 | /** | |
| 611 | * The string value of the tag name | |
| 612 | */ | |
| 613 | String tagName; | |
| 614 | /** | |
| 615 | * The attribute key we just read | |
| 616 | */ | |
| 617 | String currentAttributeKey; | |
| 618 | /** | |
| 619 | * A map of attribute keys to their values (in some cases there won't be an equals | |
| 620 | * sign, meaning the value is null. In other cases there will be an | |
| 621 | * equals sign but no value, meaning the value is empty string). | |
| 622 | */ | |
| 623 | Map<String, String> attributes; | |
| 624 | ||
| 625 | /** | |
| 626 | * Indicates which line we're on, for debugging | |
| 627 | */ | |
| 628 | int lineRow; | |
| 629 | ||
| 630 | /** | |
| 631 | * How far we are from the last newline character, including | |
| 632 | * all whitespace as well. | |
| 633 | */ | |
| 634 | int lineColumn; | |
| 635 | ||
| 636 | /** | |
| 637 | * This is used to check for comments and script tags, like: | |
| 638 | * {@code <!-- -->} and {@code <script>} | |
| 639 | */ | |
| 640 | final RingBuffer<Character> previousCharacters; | |
| 641 | ||
| 642 | /** | |
| 643 | * Indicates whether we are inside a comment | |
| 644 | */ | |
| 645 | boolean isInsideComment; | |
| 646 | ||
| 647 | /** Presumably indicates whether we are inside a script element, by analogy with isInsideComment - confirm against the parser logic. */ boolean isInsideScript; | |
| 648 | ||
| 649 | /** | |
| 650 | * Holds the state so we can remember where we are as we examine the HTML | |
| 651 | * a character at a time. Each argument is assigned as-is to the field of the same name. | |
| 652 | */ | |
| 653 | public State(int charsRead, boolean isInsideTag, StringBuilder stringBuilder, | |
| 654 | Deque<HtmlParseNode> parseStack, boolean hasEncounteredTagName, boolean isReadingTagName, | |
| 655 | boolean isStartTag, boolean isInsideAttributeValueQuoted, String tagName, | |
| 656 | String currentAttributeKey, Map<String, String> attributes, QuoteType quoteType, | |
| 657 | boolean isReadingAttributeKey, boolean isHalfClosedTag, int lineRow, int lineColumn, | |
| 658 | RingBuffer<Character> previousCharacters, boolean isInsideComment, boolean isInsideScript) { | |
| 659 | ||
| 660 | this.charsRead = charsRead; | |
| 661 | this.isInsideTag = isInsideTag; | |
| 662 | this.stringBuilder = stringBuilder; | |
| 663 | this.parseStack = parseStack; | |
| 664 | this.hasEncounteredTagName = hasEncounteredTagName; | |
| 665 | this.isReadingTagName = isReadingTagName; | |
| 666 | this.isStartTag = isStartTag; | |
| 667 | this.isInsideAttributeValueQuoted = isInsideAttributeValueQuoted; | |
| 668 | this.tagName = tagName; | |
| 669 | this.currentAttributeKey = currentAttributeKey; | |
| 670 | this.attributes = attributes; | |
| 671 | this.quoteType = quoteType; | |
| 672 | this.isReadingAttributeKey = isReadingAttributeKey; | |
| 673 | this.isHalfClosedTag = isHalfClosedTag; | |
| 674 | this.lineRow = lineRow; | |
| 675 | this.lineColumn = lineColumn; | |
| 676 | this.previousCharacters = previousCharacters; | |
| 677 | this.isInsideComment = isInsideComment; | |
| 678 | this.isInsideScript = isInsideScript; | |
| 679 | } | |
| 680 | } | |
| 681 | ||
| 682 | /** | |
| 683 | * Search the node trees for matching elements: calls {@code search(tagName, attributes)} on each node in {@code nodes} and adds every result, in order, to one new list. | |
| 684 | * <p> | |
| 685 | * If zero nodes are found, returns an empty list. | |
| 686 | * </p> | |
| 687 | */ | |
| 688 | public List<HtmlParseNode> search(List<HtmlParseNode> nodes, TagName tagName, Map<String, String> attributes) { | |
| 689 | List<HtmlParseNode> foundNodes = new ArrayList<>(); | |
| 690 | for (var node : nodes) { | |
| 691 | var result = node.search(tagName, attributes); | |
| 692 | foundNodes.addAll(result); | |
| 693 | } | |
| 694 |
1
1. search : replaced return value with Collections.emptyList for com/renomad/minum/htmlparsing/HtmlParser::search → KILLED |
return foundNodes; |
| 695 | } | |
| 696 | ||
| 697 | } | |
Mutations | ||
| 61 |
1.1 2.2 |
|
| 71 |
1.1 2.2 |
|
| 74 |
1.1 |
|
| 91 |
1.1 |
|
| 94 |
1.1 |
|
| 95 |
1.1 |
|
| 96 |
1.1 |
|
| 97 |
1.1 |
|
| 100 |
1.1 |
|
| 105 |
1.1 |
|
| 106 |
1.1 |
|
| 107 |
1.1 |
|
| 108 |
1.1 |
|
| 110 |
1.1 |
|
| 119 |
1.1 |
|
| 120 |
1.1 |
|
| 121 |
1.1 |
|
| 124 |
1.1 |
|
| 129 |
1.1 |
|
| 130 |
1.1 |
|
| 145 |
1.1 |
|
| 152 |
1.1 |
|
| 157 |
1.1 |
|
| 159 |
1.1 2.2 3.3 |
|
| 163 |
1.1 |
|
| 165 |
1.1 |
|
| 175 |
1.1 |
|
| 179 |
1.1 2.2 3.3 4.4 |
|
| 184 |
1.1 |
|
| 191 |
1.1 |
|
| 199 |
1.1 |
|
| 213 |
1.1 |
|
| 218 |
1.1 2.2 |
|
| 219 |
1.1 |
|
| 229 |
1.1 |
|
| 245 |
1.1 2.2 |
|
| 247 |
1.1 |
|
| 248 |
1.1 |
|
| 249 |
1.1 |
|
| 250 |
1.1 |
|
| 251 |
1.1 |
|
| 252 |
1.1 |
|
| 259 |
1.1 2.2 3.3 |
|
| 274 |
1.1 2.2 3.3 |
|
| 277 |
1.1 2.2 |
|
| 291 |
1.1 2.2 |
|
| 292 |
1.1 2.2 |
|
| 293 |
1.1 |
|
| 298 |
1.1 |
|
| 299 |
1.1 |
|
| 310 |
1.1 |
|
| 312 |
1.1 |
|
| 318 |
1.1 |
|
| 322 |
1.1 |
|
| 332 |
1.1 2.2 |
|
| 333 |
1.1 2.2 |
|
| 338 |
1.1 |
|
| 341 |
1.1 |
|
| 354 |
1.1 2.2 |
|
| 361 |
1.1 2.2 |
|
| 379 |
1.1 |
|
| 389 |
1.1 2.2 |
|
| 394 |
1.1 |
|
| 407 |
1.1 |
|
| 432 |
1.1 |
|
| 438 |
1.1 |
|
| 445 |
1.1 |
|
| 467 |
1.1 |
|
| 469 |
1.1 |
|
| 472 |
1.1 |
|
| 475 |
1.1 |
|
| 478 |
1.1 2.2 |
|
| 482 |
1.1 |
|
| 483 |
1.1 |
|
| 486 |
1.1 |
|
| 505 |
1.1 |
|
| 509 |
1.1 |
|
| 525 |
1.1 |
|
| 526 |
1.1 |
|
| 528 |
1.1 |
|
| 554 |
1.1 |
|
| 694 |
1.1 |