| 1 | package com.renomad.minum.htmlparsing; | |
| 2 | ||
| 3 | import com.renomad.minum.security.ForbiddenUseException; | |
| 4 | import com.renomad.minum.utils.RingBuffer; | |
| 5 | ||
| 6 | import java.util.*; | |
| 7 | ||
| 8 | /** | |
| 9 | * Converts HTML strings to object trees. | |
| 10 | * <p> | |
| 11 | * Enables a developer to analyze an HTML document by its | |
| 12 | * structure. | |
| 13 | * </p> | |
| 14 | * <p> | |
| 15 | * Note: HTML parsing is difficult because | |
| 16 | * of its lenient specification. See Postel's Law. | |
| 17 | * </p> | |
| 18 | * <p> | |
| 19 | * For our purposes, it is less important | |
| 20 | * to perfectly meet the criteria of the spec, so | |
| 21 | * there will be numerous edge-cases unaccounted-for | |
| 22 | * by this implementation. Nevertheless, this program | |
| 23 | * should suit many needs for ordinary web applications. | |
| 24 | * </p> | |
| 25 | */ | |
| 26 | public final class HtmlParser { | |
| 27 | ||
| 28 | /** | |
| 29 | * Most total chars we'll read. | |
| 30 | */ | |
| 31 | static final int MAX_HTML_SIZE = 2 * 1024 * 1024; | |
| 32 | ||
| 33 | /** | |
| 34 | * Given any HTML input, scan through and generate a tree | |
| 35 | * of HTML nodes. Return a list of the roots of the tree. | |
| 36 | * <p> | |
| 37 | * This parser operates with a very particular paradigm in mind. I'll explain | |
| 38 | * it through examples. Let's look at some typical HTML: | |
| 39 | * </p> | |
| 40 | * <pre>{@code <p>Hello world</p>}</pre> | |
| 41 | * <p> | |
| 42 | * The way we will model this is as follows: | |
| 43 | * </p> | |
| 44 | * <pre>{@code <ELEMENT_NAME_AND_DETAILS>content<END_OF_ELEMENT>}</pre> | |
| 45 | * <p> | |
| 46 | * We will examine the first part, "ELEMENT_NAME_AND_DETAILS", and | |
| 47 | * grab the element's name and any attributes. Then we will descend into the | |
| 48 | * content section. We know we have hit the end of the element by keeping | |
| 49 | * track of how far we have descended/ascended and whether we are hitting | |
| 50 | * a closing HTML element. | |
| 51 | * </p> | |
| 52 | * <p> | |
| 53 | * Complicating this is that elements may not have content, for example | |
| 54 | * any <a href="https://www.w3.org/TR/2011/WD-html-markup-20110113/syntax.html#void-element_xref3">void elements</a> | |
| 55 | * or when a user chooses to create an empty tag | |
| 56 | * </p> | |
| 57 | */ | |
| 58 | public List<HtmlParseNode> parse(String input) { | |
| 59 |
2
1. parse : changed conditional boundary → KILLED 2. parse : negated conditional → KILLED |
if (input.length() > MAX_HTML_SIZE) |
| 60 | throw new ForbiddenUseException("Input exceeds max allowed HTML text size, " + MAX_HTML_SIZE + " chars"); | |
| 61 | var chars = input.toCharArray(); | |
| 62 | List<HtmlParseNode> nodes = new ArrayList<>(); | |
| 63 | State state = State.buildNewState(); | |
| 64 | ||
| 65 | for (char currentChar : chars) { | |
| 66 |
1
1. parse : removed call to com/renomad/minum/htmlparsing/HtmlParser::processState → KILLED |
processState(currentChar, state, nodes); |
| 67 | } | |
| 68 |
1
1. parse : replaced return value with Collections.emptyList for com/renomad/minum/htmlparsing/HtmlParser::parse → KILLED |
return nodes; |
| 69 | } | |
| 70 | ||
| 71 | /** | |
| 72 | * Use important symbols in the HTML code to indicate | |
| 73 | * which mode we are in - reading inside a tag, or between | |
| 74 | * tags. | |
| 75 | * <p> | |
| 76 | * Apologies to future readers. Hand-written parser code is the suck. | |
| 77 | * </p> | |
| 78 | * <p> | |
| 79 | * That said, there are plenty of tests exercising this, and it is | |
| 80 | * easy to test due to having been built using TDD. Cold comfort, I know. | |
| 81 | * </p> | |
| 82 | */ | |
| 83 | private void processState(char currentChar, State state, List<HtmlParseNode> nodes) { | |
| 84 |
1
1. processState : removed call to com/renomad/minum/htmlparsing/HtmlParser::recordLocation → KILLED |
recordLocation(currentChar, state); |
| 85 | ||
| 86 | // keep track of previous twelve characters, to check if inside comments and scripts | |
| 87 |
1
1. processState : removed call to com/renomad/minum/utils/RingBuffer::add → KILLED |
state.previousCharacters.add(currentChar); |
| 88 |
1
1. processState : removed call to com/renomad/minum/htmlparsing/HtmlParser::determineCommentState → KILLED |
determineCommentState(state); |
| 89 |
1
1. processState : removed call to com/renomad/minum/htmlparsing/HtmlParser::determineScriptState → KILLED |
determineScriptState(state); |
| 90 |
1
1. processState : negated conditional → KILLED |
if (state.isInsideComment) { |
| 91 | return; | |
| 92 | } | |
| 93 |
1
1. processState : negated conditional → KILLED |
if (state.isInsideScript) { |
| 94 | state.stringBuilder.append(currentChar); | |
| 95 | return; | |
| 96 | } | |
| 97 | ||
| 98 |
1
1. processState : negated conditional → KILLED |
if (currentChar == '<') { |
| 99 |
1
1. processState : removed call to com/renomad/minum/htmlparsing/HtmlParser::processLessThan → KILLED |
processLessThan(currentChar, state); |
| 100 |
1
1. processState : negated conditional → KILLED |
} else if (currentChar == '>') { |
| 101 |
1
1. processState : removed call to com/renomad/minum/htmlparsing/HtmlParser::processGreaterThan → KILLED |
processGreaterThan(currentChar, state, nodes); |
| 102 | } else { | |
| 103 |
1
1. processState : removed call to com/renomad/minum/htmlparsing/HtmlParser::addingToken → KILLED |
addingToken(state, currentChar); |
| 104 | } | |
| 105 | } | |
| 106 | ||
| 107 | /** | |
| 108 | * handle basic recording of stats, like row and column, | |
| 109 | * useful during error messages | |
| 110 | */ | |
| 111 | private static void recordLocation(char currentChar, State state) { | |
| 112 |
1
1. recordLocation : Replaced integer addition with subtraction → KILLED |
state.charsRead += 1; |
| 113 |
1
1. recordLocation : negated conditional → KILLED |
if (currentChar == '\n') { |
| 114 |
1
1. recordLocation : Replaced integer addition with subtraction → KILLED |
state.lineRow += 1; |
| 115 | state.lineColumn = 0; | |
| 116 | } | |
| 117 |
1
1. recordLocation : Replaced integer addition with subtraction → KILLED |
state.lineColumn += 1; |
| 118 | } | |
| 119 | ||
| 120 | private void processGreaterThan(char currentChar, State state, List<HtmlParseNode> nodes) { | |
| 121 | /* It's allowed to use greater-than signs in a lot of places */ | |
| 122 |
1
1. processGreaterThan : negated conditional → KILLED |
if (state.isInsideTag) { |
| 123 |
1
1. processGreaterThan : removed call to com/renomad/minum/htmlparsing/HtmlParser::handleExitingTag → KILLED |
handleExitingTag(currentChar, state, nodes); |
| 124 | } else { | |
| 125 | /* | |
| 126 | This situation means we're looking at a | |
| 127 | free-floating greater-than symbol in | |
| 128 | the html text. | |
| 129 | */ | |
| 130 | state.stringBuilder.append(currentChar); | |
| 131 | } | |
| 132 | } | |
| 133 | ||
| 134 | /** | |
| 135 | * As we leave the tag, we make some decisions about it. | |
| 136 | */ | |
| 137 | private void handleExitingTag(char currentChar, State state, List<HtmlParseNode> nodes) { | |
| 138 |
1
1. handleExitingTag : negated conditional → KILLED |
if (state.isInsideAttributeValueQuoted) { |
| 139 | /* | |
| 140 | Here, we're looking at a greater-than | |
| 141 | that is inside a quoted attribute value | |
| 142 | */ | |
| 143 | state.stringBuilder.append(currentChar); | |
| 144 | } else { | |
| 145 |
1
1. handleExitingTag : removed call to com/renomad/minum/htmlparsing/HtmlParser::handleTagComponents → KILLED |
handleTagComponents(state, nodes); |
| 146 | } | |
| 147 | } | |
| 148 | ||
| 149 | private void handleTagComponents(State state, List<HtmlParseNode> nodes) { | |
| 150 |
1
1. handleTagComponents : negated conditional → KILLED |
if (hasFinishedBuildingTagname(state.hasEncounteredTagName, state.tagName, state.stringBuilder)) { |
| 151 | state.tagName = state.stringBuilder.toString(); | |
| 152 |
3
1. handleTagComponents : negated conditional → KILLED 2. handleTagComponents : negated conditional → KILLED 3. handleTagComponents : negated conditional → KILLED |
} else if (!state.stringBuilder.isEmpty() && state.currentAttributeKey.isBlank() && state.isReadingAttributeKey) { |
| 153 | state.attributes.put(state.stringBuilder.toString(), ""); | |
| 154 | state.stringBuilder = new StringBuilder(); | |
| 155 | state.isReadingAttributeKey = false; | |
| 156 |
1
1. handleTagComponents : negated conditional → KILLED |
} else if (!state.currentAttributeKey.isBlank()) { |
| 157 | // if we were in the midst of reading attribute stuff when we hit the closing bracket... | |
| 158 |
1
1. handleTagComponents : negated conditional → KILLED |
if (!state.stringBuilder.isEmpty()) { |
| 159 | state.attributes.put(state.currentAttributeKey, state.stringBuilder.toString()); | |
| 160 | } else { | |
| 161 | state.attributes.put(state.currentAttributeKey, ""); | |
| 162 | } | |
| 163 | state.isInsideAttributeValueQuoted = false; | |
| 164 | state.stringBuilder = new StringBuilder(); | |
| 165 | state.currentAttributeKey = ""; | |
| 166 | } | |
| 167 | ||
| 168 |
1
1. handleTagComponents : removed call to com/renomad/minum/htmlparsing/HtmlParser::processTagAndResetState → KILLED |
processTagAndResetState(state, nodes); |
| 169 | } | |
| 170 | ||
| 171 | static boolean hasFinishedBuildingTagname(boolean hasEncounteredTagName, String tagName, StringBuilder sb) { | |
| 172 |
4
1. hasFinishedBuildingTagname : negated conditional → KILLED 2. hasFinishedBuildingTagname : negated conditional → KILLED 3. hasFinishedBuildingTagname : replaced boolean return with true for com/renomad/minum/htmlparsing/HtmlParser::hasFinishedBuildingTagname → KILLED 4. hasFinishedBuildingTagname : negated conditional → KILLED |
return hasEncounteredTagName && tagName.isEmpty() && !sb.isEmpty(); |
| 173 | } | |
| 174 | ||
| 175 | private void processLessThan(char currentChar, State state) { | |
| 176 | /* less-than signs are policed strictly */ | |
| 177 |
1
1. processLessThan : negated conditional → KILLED |
if (state.isInsideAttributeValueQuoted) { |
| 178 | /* | |
| 179 | Here, we're looking at a less-than that | |
| 180 | is inside a quoted attribute value | |
| 181 | */ | |
| 182 | state.stringBuilder.append(currentChar); | |
| 183 | } else { | |
| 184 |
1
1. processLessThan : removed call to com/renomad/minum/htmlparsing/HtmlParser::enteringTag → KILLED |
enteringTag(state); |
| 185 | } | |
| 186 | } | |
| 187 | ||
| 188 | /** | |
| 189 | * When we've read a less-than sign and are entering an HTML tag. | |
| 190 | */ | |
| 191 | private void enteringTag(State state) { | |
| 192 |
1
1. enteringTag : removed call to com/renomad/minum/htmlparsing/HtmlParser::addText → KILLED |
addText(state); |
| 193 | ||
| 194 | state.isInsideTag = true; | |
| 195 | /* | |
| 196 | not really sure it's a start tag, but if we | |
| 197 | assume it is that's fine, because if we hit | |
| 198 | a forward slash at the beginning, it becomes | |
| 199 | a non-start-tag. | |
| 200 | */ | |
| 201 | state.isStartTag = true; | |
| 202 | state.stringBuilder = new StringBuilder(); | |
| 203 | } | |
| 204 | ||
| 205 | private static void addText(State state) { | |
| 206 |
1
1. addText : negated conditional → KILLED |
if (!state.stringBuilder.isEmpty()) { |
| 207 | ||
| 208 | String textContent = state.stringBuilder.toString(); | |
| 209 | ||
| 210 | // This is where we add characters if we found any between tags. | |
| 211 |
2
1. addText : negated conditional → KILLED 2. addText : negated conditional → KILLED |
if (! state.parseStack.isEmpty() && ! textContent.isBlank()) { |
| 212 |
1
1. addText : removed call to com/renomad/minum/htmlparsing/HtmlParseNode::addToInnerContent → KILLED |
state.parseStack.peek().addToInnerContent(new HtmlParseNode(ParseNodeType.CHARACTERS, TagInfo.EMPTY, new ArrayList<>(), textContent)); |
| 213 | } | |
| 214 | } | |
| 215 | } | |
| 216 | ||
| 217 | /** | |
| 218 | * Called when we've just hit a greater-than sign and thus | |
| 219 | * exited an HTML tag. | |
| 220 | */ | |
| 221 | private void processTagAndResetState(State state, List<HtmlParseNode> nodes) { | |
| 222 |
1
1. processTagAndResetState : removed call to com/renomad/minum/htmlparsing/HtmlParser::processTag → KILLED |
processTag(state, nodes); |
| 223 | ||
| 224 | state.isHalfClosedTag = false; | |
| 225 | state.isInsideTag = false; | |
| 226 | state.isStartTag = false; | |
| 227 | state.isReadingTagName = false; | |
| 228 | state.tagName = ""; | |
| 229 | state.attributes = new HashMap<>(); | |
| 230 | state.hasEncounteredTagName = false; | |
| 231 | state.stringBuilder = new StringBuilder(); | |
| 232 | } | |
| 233 | ||
| 234 | /** | |
| 235 | * The commonest case when reading characters. Buckle up. | |
| 236 | */ | |
| 237 | private void addingToken(State state, char currentChar) { | |
| 238 |
2
1. addingToken : negated conditional → KILLED 2. addingToken : negated conditional → KILLED |
var hasNotBegunReadingTagName = state.isInsideTag && !state.hasEncounteredTagName; |
| 239 | ||
| 240 |
1
1. addingToken : negated conditional → KILLED |
if (hasNotBegunReadingTagName) { |
| 241 |
1
1. addingToken : removed call to com/renomad/minum/htmlparsing/HtmlParser::handleBeforeReadingTagName → KILLED |
handleBeforeReadingTagName(state, currentChar); |
| 242 |
1
1. addingToken : negated conditional → KILLED |
} else if (state.isReadingTagName) { |
| 243 |
1
1. addingToken : removed call to com/renomad/minum/htmlparsing/HtmlParser::handleReadingTagName → KILLED |
handleReadingTagName(state, currentChar); |
| 244 |
1
1. addingToken : negated conditional → KILLED |
} else if (isFinishedReadingTag(state.tagName, state.isInsideTag)) { |
| 245 |
1
1. addingToken : removed call to com/renomad/minum/htmlparsing/HtmlParser::handleAfterReadingTagName → KILLED |
handleAfterReadingTagName(state, currentChar); |
| 246 | } else { | |
| 247 | state.stringBuilder.append(currentChar); | |
| 248 | } | |
| 249 | } | |
| 250 | ||
| 251 | static boolean isFinishedReadingTag(String tagName, boolean isInsideTag) { | |
| 252 |
3
1. isFinishedReadingTag : negated conditional → KILLED 2. isFinishedReadingTag : replaced boolean return with true for com/renomad/minum/htmlparsing/HtmlParser::isFinishedReadingTag → KILLED 3. isFinishedReadingTag : negated conditional → KILLED |
return !tagName.isEmpty() && isInsideTag; |
| 253 | } | |
| 254 | ||
| 255 | static final List<Character> startOfComment = List.of('<', '!', '-', '-'); | |
| 256 | static final List<Character> endOfComment = List.of('-', '-', '>'); | |
| 257 | ||
| 258 | /** | |
| 259 | * Returns whether we are inside an HTML comment, | |
| 260 | * that is {@code <!-- -->} | |
| 261 | */ | |
| 262 | private void determineCommentState(State state) { | |
| 263 | boolean atCommentStart = state.previousCharacters.containsAt(startOfComment, 8); | |
| 264 | boolean atCommentEnd = state.previousCharacters.containsAt(endOfComment, 8); | |
| 265 | boolean isInsideTag = state.isInsideTag; | |
| 266 | boolean hasEncounteredTagName = state.hasEncounteredTagName; | |
| 267 |
3
1. determineCommentState : negated conditional → KILLED 2. determineCommentState : negated conditional → KILLED 3. determineCommentState : negated conditional → KILLED |
if (isInsideTag && !hasEncounteredTagName && atCommentStart) { |
| 268 | state.isInsideComment = true; | |
| 269 | state.isInsideTag = false; | |
| 270 |
2
1. determineCommentState : negated conditional → KILLED 2. determineCommentState : negated conditional → KILLED |
} else if (state.isInsideComment && atCommentEnd) { |
| 271 | state.isInsideComment = false; | |
| 272 | } | |
| 273 | } | |
| 274 | ||
| 275 | static final List<Character> scriptElement = List.of('<','/','s','c','r','i','p','t','>'); | |
| 276 | ||
| 277 | /** | |
| 278 | * Determines whether we have hit the end of the script block | |
| 279 | * by looking for the closing script tag. | |
| 280 | */ | |
| 281 | private void determineScriptState(State state) { | |
| 282 | boolean isScriptFinished = state.previousCharacters.containsAt(scriptElement, 3); | |
| 283 | boolean wasInsideScript = state.isInsideScript; | |
| 284 |
2
1. determineScriptState : negated conditional → KILLED 2. determineScriptState : negated conditional → KILLED |
state.isInsideScript = state.isInsideScript && !isScriptFinished; |
| 285 |
2
1. determineScriptState : negated conditional → KILLED 2. determineScriptState : negated conditional → KILLED |
boolean justClosedScriptTag = wasInsideScript && !state.isInsideScript; |
| 286 |
1
1. determineScriptState : negated conditional → KILLED |
if (justClosedScriptTag) { |
| 287 | state.tagName = "script"; | |
| 288 | state.isInsideTag = true; | |
| 289 | state.isStartTag = false; | |
| 290 | var innerTextLength = state.stringBuilder.length(); | |
| 291 |
1
1. determineScriptState : Replaced integer subtraction with addition → KILLED |
state.stringBuilder.delete(innerTextLength - 8, innerTextLength); |
| 292 |
1
1. determineScriptState : removed call to com/renomad/minum/htmlparsing/HtmlParser::addText → KILLED |
addText(state); |
| 293 | } | |
| 294 | ||
| 295 | } | |
| 296 | ||
| 297 | /** | |
| 298 | * at this point we have a tagname for our tag, and we're still in the tag | |
| 299 | */ | |
| 300 | private static void handleAfterReadingTagName(State state, char currentChar) { | |
| 301 | ||
| 302 | boolean isHandlingAttributes = isHandlingAttributes(state, currentChar); | |
| 303 |
1
1. handleAfterReadingTagName : negated conditional → KILLED |
if (isHandlingAttributes) { |
| 304 | ||
| 305 |
1
1. handleAfterReadingTagName : negated conditional → KILLED |
if (state.currentAttributeKey.isBlank()) { |
| 306 | /* | |
| 307 | because the key is blank, we know we haven't read it all. That's | |
| 308 | because when we finish reading the key, we'll add it to currentAttributeKey | |
| 309 | and be in the mode of reading the value. | |
| 310 | */ | |
| 311 |
1
1. handleAfterReadingTagName : removed call to com/renomad/minum/htmlparsing/HtmlParser::handleNotFullyReadAttributeKey → KILLED |
handleNotFullyReadAttributeKey(state, currentChar); |
| 312 | } else { | |
| 313 | // reading in the (potential) attribute value | |
| 314 | ||
| 315 |
1
1. handleAfterReadingTagName : removed call to com/renomad/minum/htmlparsing/HtmlParser::handlePotentialAttributeValue → KILLED |
handlePotentialAttributeValue(state, currentChar); |
| 316 | } | |
| 317 | } | |
| 318 | } | |
| 319 | ||
| 320 | /** | |
| 321 | * Check whether we're past the whitespace between the tag name and | |
| 322 | * the start of the (potential) attribute key. | |
| 323 | */ | |
| 324 | static boolean isHandlingAttributes(State state, char currentChar) { | |
| 325 |
2
1. isHandlingAttributes : replaced boolean return with true for com/renomad/minum/htmlparsing/HtmlParser::isHandlingAttributes → KILLED 2. isHandlingAttributes : negated conditional → KILLED |
return ! (state.currentAttributeKey.isEmpty() && |
| 326 |
2
1. isHandlingAttributes : negated conditional → KILLED 2. isHandlingAttributes : negated conditional → KILLED |
state.stringBuilder.isEmpty() |
| 327 | && currentChar == ' '); | |
| 328 | } | |
| 329 | ||
| 330 | private static void handlePotentialAttributeValue(State state, char currentChar) { | |
| 331 |
1
1. handlePotentialAttributeValue : negated conditional → KILLED |
if (state.isInsideAttributeValueQuoted) { |
| 332 | // if we're already inside a quoted area, encountering a | |
| 333 | // closing quote will take us out of it. | |
| 334 |
1
1. handlePotentialAttributeValue : negated conditional → KILLED |
if (currentChar == state.quoteType.literal) { |
| 335 | // if we hit the matching end-quote, switch modes | |
| 336 | state.isInsideAttributeValueQuoted = false; | |
| 337 | state.quoteType = QuoteType.NONE; | |
| 338 | state.attributes.put(state.currentAttributeKey, state.stringBuilder.toString()); | |
| 339 | state.stringBuilder = new StringBuilder(); | |
| 340 | state.currentAttributeKey = ""; | |
| 341 | state.isReadingAttributeKey = false; | |
| 342 | } else { | |
| 343 | // otherwise keep on trucking, adding characters | |
| 344 | state.stringBuilder.append(currentChar); | |
| 345 | } | |
| 346 | } else { | |
| 347 |
2
1. handlePotentialAttributeValue : negated conditional → KILLED 2. handlePotentialAttributeValue : negated conditional → KILLED |
if (currentChar == '"' || currentChar == '\'') { |
| 348 | /* | |
| 349 | if we're not currently inside a quoted area but encounter | |
| 350 | a quote, switch modes. | |
| 351 | */ | |
| 352 | state.isInsideAttributeValueQuoted = true; | |
| 353 | state.quoteType = QuoteType.byLiteral(currentChar); | |
| 354 |
2
1. handlePotentialAttributeValue : negated conditional → KILLED 2. handlePotentialAttributeValue : negated conditional → KILLED |
} else if (!state.stringBuilder.isEmpty() && currentChar == ' ') { |
| 355 | /* | |
| 356 | if we're not in a quoted area and encounter a space, then | |
| 357 | we're done reading the attribute value and can add the key-value | |
| 358 | pair to the map. | |
| 359 | */ | |
| 360 | state.attributes.put(state.currentAttributeKey, state.stringBuilder.toString()); | |
| 361 | state.isReadingAttributeKey = false; | |
| 362 | state.stringBuilder = new StringBuilder(); | |
| 363 | state.currentAttributeKey = ""; | |
| 364 | } else { | |
| 365 | // otherwise keep trucking along adding characters | |
| 366 | state.stringBuilder.append(currentChar); | |
| 367 | } | |
| 368 | } | |
| 369 | } | |
| 370 | ||
| 371 | private static void handleNotFullyReadAttributeKey(State state, char currentChar) { | |
| 372 |
1
1. handleNotFullyReadAttributeKey : negated conditional → KILLED |
if (state.isHalfClosedTag) { |
| 373 | /* | |
| 374 | This situation occurs when we are in a void tag, like <link />, | |
| 375 | and are closing the tag with a forward slash + closing bracket. | |
| 376 | ||
| 377 | if we got here, it means the previous char was | |
| 378 | a forward slash, so the current character *should* | |
| 379 | be a closing angle, but if it's not ... | |
| 380 | */ | |
| 381 | throw new ParsingException(String.format("in closing a void tag (e.g. <link />), character after forward slash must be angle bracket. Char: %s at line %d and at the %d character. %d chars read in total.", currentChar, state.lineRow, state.lineColumn, state.charsRead)); | |
| 382 |
2
1. handleNotFullyReadAttributeKey : negated conditional → KILLED 2. handleNotFullyReadAttributeKey : negated conditional → KILLED |
} else if (currentChar == ' ' || currentChar == '=') { |
| 383 | // if we hit whitespace or an equals sign, we're done reading the key | |
| 384 | state.currentAttributeKey = state.stringBuilder.toString(); | |
| 385 | state.isReadingAttributeKey = false; | |
| 386 | state.stringBuilder = new StringBuilder(); | |
| 387 |
1
1. handleNotFullyReadAttributeKey : negated conditional → KILLED |
} else if (currentChar == '/') { |
| 388 | // a forward-slash cannot be in the attribute key | |
| 389 | state.isReadingAttributeKey = false; | |
| 390 | state.isHalfClosedTag = true; | |
| 391 | } else { | |
| 392 | // otherwise keep on reading | |
| 393 | state.stringBuilder.append(currentChar); | |
| 394 | // and note we are reading the key | |
| 395 | state.isReadingAttributeKey = true; | |
| 396 | } | |
| 397 | } | |
| 398 | ||
| 399 | private static void handleReadingTagName(State state, char currentChar) { | |
| 400 |
1
1. handleReadingTagName : negated conditional → KILLED |
if (Character.isWhitespace(currentChar)) { |
| 401 | /* | |
| 402 | At this point, we've been reading the tag name, and we've encountered whitespace. | |
| 403 | That means we are done reading the tag name | |
| 404 | */ | |
| 405 | state.hasEncounteredTagName = true; | |
| 406 | state.isReadingTagName = false; | |
| 407 | state.tagName = state.stringBuilder.toString(); | |
| 408 | state.attributes = new HashMap<>(); | |
| 409 | state.stringBuilder = new StringBuilder(); | |
| 410 | } else { | |
| 411 | /* | |
| 412 | Reading the characters of the tag name | |
| 413 | */ | |
| 414 | state.hasEncounteredTagName = true; | |
| 415 | state.tagName = ""; | |
| 416 | state.stringBuilder.append(currentChar); | |
| 417 | } | |
| 418 | } | |
| 419 | ||
| 420 | /** | |
| 421 | * We're just past a starting angle bracket, so we're | |
| 422 | * feeling our way around what this element is. | |
| 423 | */ | |
| 424 | private static void handleBeforeReadingTagName(State state, char currentChar) { | |
| 425 |
1
1. handleBeforeReadingTagName : negated conditional → KILLED |
if (currentChar == ' ') { |
| 426 | /* | |
| 427 | At this point, we're inside the tag, and we've encountered whitespace. | |
| 428 | Seeking the tag name (although we may be inside a closing tag). | |
| 429 | */ | |
| 430 | state.stringBuilder = new StringBuilder(); | |
| 431 |
1
1. handleBeforeReadingTagName : negated conditional → KILLED |
} else if (currentChar == '/') { |
| 432 | /* | |
| 433 | hitting a forward-slash symbol means we're looking | |
| 434 | at the closure of a tag | |
| 435 | */ | |
| 436 | state.isStartTag = false; | |
| 437 | state.stringBuilder = new StringBuilder(); | |
| 438 |
1
1. handleBeforeReadingTagName : negated conditional → KILLED |
} else if (Character.isAlphabetic(currentChar)) { |
| 439 | ||
| 440 | /* | |
| 441 | Here, our input could definitely be the letters of a tag name | |
| 442 | */ | |
| 443 | state.hasEncounteredTagName = true; | |
| 444 | state.isReadingTagName = true; | |
| 445 | state.stringBuilder.append(currentChar); | |
| 446 | } | |
| 447 | } | |
| 448 | ||
| 449 | /** | |
| 450 | * This examines the results of reading a tag - if it's | |
| 451 | * a start tag, it pushes it onto a stack for later | |
| 452 | * comparison to the end tag. The stack is a key | |
| 453 | * component of how we are able to nest the tags properly. | |
| 454 | */ | |
| 455 | private void processTag(State state, List<HtmlParseNode> nodes) { | |
| 456 | String tagNameString = state.tagName; | |
| 457 | TagName tagName; | |
| 458 | ||
| 459 | tagName = TagName.findMatchingTagname(tagNameString); | |
| 460 |
1
1. processTag : negated conditional → KILLED |
if (tagName.equals(TagName.UNRECOGNIZED)) return; |
| 461 | var tagInfo = new TagInfo(tagName, state.attributes); | |
| 462 |
1
1. processTag : negated conditional → KILLED |
if (state.isStartTag) { |
| 463 | HtmlParseNode newNode = new HtmlParseNode(ParseNodeType.ELEMENT, tagInfo, new ArrayList<>(), ""); | |
| 464 | ||
| 465 |
1
1. processTag : negated conditional → KILLED |
if (! state.parseStack.isEmpty()) { |
| 466 | // if we're inside an html element, | |
| 467 | // add this to the inner content | |
| 468 |
1
1. processTag : removed call to com/renomad/minum/htmlparsing/HtmlParseNode::addToInnerContent → KILLED |
state.parseStack.peek().addToInnerContent(newNode); |
| 469 | } | |
| 470 | ||
| 471 |
2
1. processTag : negated conditional → KILLED 2. processTag : negated conditional → KILLED |
if (state.parseStack.isEmpty() && tagName.isVoidElement) { |
| 472 | // if we're at the root level and encountering a void element, | |
| 473 | // add it to the root-level list of nodes | |
| 474 | nodes.add(newNode); | |
| 475 |
1
1. processTag : negated conditional → KILLED |
} else if (!tagName.isVoidElement) { |
| 476 |
1
1. processTag : removed call to java/util/Deque::push → KILLED |
state.parseStack.push(newNode); |
| 477 | } | |
| 478 | ||
| 479 |
1
1. processTag : negated conditional → KILLED |
if (tagName.equals(TagName.SCRIPT)) { |
| 480 | state.isInsideScript = true; | |
| 481 | state.stringBuilder = new StringBuilder(); | |
| 482 | } | |
| 483 | } else { | |
| 484 | // if we're leaving an end-tag, it means we have a | |
| 485 | // full element with potentially inner content | |
| 486 | HtmlParseNode htmlParseNode; | |
| 487 | try { | |
| 488 | htmlParseNode = state.parseStack.pop(); | |
| 489 | } catch (NoSuchElementException ex) { | |
| 490 | throw new ParsingException("No starting tag found. At line " + state.lineRow + " and at the " + state.lineColumn + "th character. " + state.charsRead + " characters read in total."); | |
| 491 | } | |
| 492 | ||
| 493 | /* | |
| 494 | If the stack is a size of zero at this point, it means we're at the | |
| 495 | roots of our HTML code, which means it's the proper time to add the | |
| 496 | topmost element we just popped into a list. | |
| 497 | */ | |
| 498 |
1
1. processTag : negated conditional → KILLED |
if (state.parseStack.isEmpty()) { |
| 499 | nodes.add(htmlParseNode); | |
| 500 | } | |
| 501 | TagName expectedTagName = htmlParseNode.getTagInfo().getTagName(); | |
| 502 |
1
1. processTag : negated conditional → KILLED |
if (expectedTagName != tagName) { |
| 503 | throw new ParsingException("Did not find expected closing-tag type. " + "Expected: " + expectedTagName + " at line " + state.lineRow + " and at the " + state.lineColumn + "th character. " + state.charsRead + " characters read in total."); | |
| 504 | } | |
| 505 | } | |
| 506 | } | |
| 507 | ||
/**
 * The kind of quote surrounding an attribute value, together with
 * the character that represents it.
 */
enum QuoteType {
    SINGLE_QUOTED('\''), DOUBLE_QUOTED('"'), NONE(Character.MIN_VALUE);

    /** The quote character for this type. */
    public final char literal;

    QuoteType(char literal) {
        this.literal = literal;
    }

    /**
     * Maps a quote character to its type. A single-quote character gives
     * {@link #SINGLE_QUOTED}; any other character gives {@link #DOUBLE_QUOTED}.
     */
    public static QuoteType byLiteral(char currentChar) {
        return currentChar == '\'' ? SINGLE_QUOTED : DOUBLE_QUOTED;
    }
}
| 525 | ||
| 526 | static class State { | |
| 527 | ||
| 528 | static State buildNewState() { | |
| 529 | RingBuffer<Character> previousCharacters = new RingBuffer<>(12, Character.class); | |
| 530 | int lineColumn1 = 0; | |
| 531 | int lineRow1 = 1; | |
| 532 | boolean isHalfClosedTag1 = false; | |
| 533 | boolean isInsideAttributeValueQuoted1 = false; | |
| 534 | boolean isStartTag1 = true; | |
| 535 | boolean isReadingTagName1 = false; | |
| 536 | boolean hasEncounteredTagName1 = false; | |
| 537 | ArrayDeque<HtmlParseNode> parseStack1 = new ArrayDeque<>(); | |
| 538 | StringBuilder stringBuilder1 = new StringBuilder(); | |
| 539 | boolean isInsideTag1 = false; | |
| 540 | int charsRead1 = 0; | |
| 541 | String tagName1 = ""; | |
| 542 | String currentAttributeKey1 = ""; | |
| 543 | HashMap<String, String> attributes1 = new HashMap<>(); | |
| 544 | boolean isReadingAttributeKey1 = false; | |
| 545 | boolean isInsideComment1 = false; | |
| 546 | boolean isInsideScript1 = false; | |
| 547 |
1
1. buildNewState : replaced return value with null for com/renomad/minum/htmlparsing/HtmlParser$State::buildNewState → KILLED |
return new State(charsRead1, isInsideTag1, stringBuilder1, parseStack1, hasEncounteredTagName1, |
| 548 | isReadingTagName1, isStartTag1, isInsideAttributeValueQuoted1, | |
| 549 | tagName1, currentAttributeKey1, attributes1, QuoteType.NONE, isReadingAttributeKey1, | |
| 550 | isHalfClosedTag1, lineRow1, lineColumn1, previousCharacters, isInsideComment1, isInsideScript1); | |
| 551 | } | |
| 552 | ||
| 553 | /** | |
| 554 | * If we encounter a forward-slash in a tag, and we're | |
| 555 | * not in the midst of reading an attribute value, then | |
| 556 | * we expect the next character to be a greater-than symbol. | |
| 557 | */ | |
| 558 | boolean isHalfClosedTag; | |
| 559 | /** | |
| 560 | * total number of chars read of this HTML file | |
| 561 | */ | |
| 562 | int charsRead; | |
| 563 | /** | |
| 564 | * True if we are inside angle brackets (may be a closing tag) | |
| 565 | */ | |
| 566 | boolean isInsideTag; | |
| 567 | /** | |
| 568 | * Where we build up tokens a character at a time | |
| 569 | */ | |
| 570 | StringBuilder stringBuilder; | |
| 571 | /** | |
| 572 | * A stack of HtmlParseNodes, used to see how far deep in the tree we are | |
| 573 | */ | |
| 574 | final Deque<HtmlParseNode> parseStack; | |
| 575 | /** | |
| 576 | * True if we have successfully encountered the first letter of the tag | |
| 577 | */ | |
| 578 | boolean hasEncounteredTagName; | |
| 579 | /** | |
| 580 | * True if we are in the process of reading the tag (e.g. p, a, h1, etc) | |
| 581 | */ | |
| 582 | boolean isReadingTagName; | |
| 583 | ||
| 584 | /** | |
| 585 | * True if we determine we are in the midst of reading an attribute key | |
| 586 | */ | |
| 587 | boolean isReadingAttributeKey; | |
| 588 | ||
| 589 | /** | |
| 590 | * True if we determine we are probably in the start tag (rather than the closing tag) | |
| 591 | */ | |
| 592 | boolean isStartTag; | |
| 593 | /** | |
| 594 | * True if we're inside the quoted area inside an attribute value in an element | |
| 595 | * tag - this could be where we encounter some symbols that may not be allowed elsewhere. | |
| 596 | */ | |
| 597 | boolean isInsideAttributeValueQuoted; | |
| 598 | /** | |
| 599 | * If we're in a quoted area, it's either single or double-quoted. | |
| 600 | * These quotes need to be paired properly, so we need to keep track. | |
| 601 | */ | |
| 602 | QuoteType quoteType; | |
| 603 | /** | |
| 604 | * The string value of the tag name | |
| 605 | */ | |
| 606 | String tagName; | |
| 607 | /** | |
| 608 | * The attribute key we just read | |
| 609 | */ | |
| 610 | String currentAttributeKey; | |
| 611 | /** | |
| 612 | * A map of attribute keys to their values. In some cases there won't be an equals | |
| 613 | * sign, meaning the value is null. In other cases there will be an | |
| 614 | * equals sign but no value, meaning the value is an empty string. | |
| 615 | */ | |
| 616 | Map<String, String> attributes; | |
| 617 | ||
| 618 | /** | |
| 619 | * Indicates which line we're on, for debugging | |
| 620 | */ | |
| 621 | int lineRow; | |
| 622 | ||
| 623 | /** | |
| 624 | * How far we are from the last newline character, including | |
| 625 | * all whitespace as well. | |
| 626 | */ | |
| 627 | int lineColumn; | |
| 628 | ||
| 629 | /** | |
| 630 | * This is used to check for comments and script tags, like: | |
| 631 | * {@code <!-- -->} and {@code <script>} | |
| 632 | */ | |
| 633 | final RingBuffer<Character> previousCharacters; | |
| 634 | ||
| 635 | /** | |
| 636 | * Indicates whether we are inside a comment | |
| 637 | */ | |
| 638 | boolean isInsideComment; | |
| 639 | ||
| 640 | boolean isInsideScript; | |
| 641 | ||
| 642 | /** | |
| 643 | * Holds the parser state so we can remember where we are as we examine the HTML | |
| 644 | * a character at a time. Every argument is assigned as-is to the field of the same name. | |
| 645 | */ | |
| 646 | public State(int charsRead, boolean isInsideTag, StringBuilder stringBuilder, | |
| 647 | Deque<HtmlParseNode> parseStack, boolean hasEncounteredTagName, boolean isReadingTagName, | |
| 648 | boolean isStartTag, boolean isInsideAttributeValueQuoted, String tagName, | |
| 649 | String currentAttributeKey, Map<String, String> attributes, QuoteType quoteType, | |
| 650 | boolean isReadingAttributeKey, boolean isHalfClosedTag, int lineRow, int lineColumn, | |
| 651 | RingBuffer<Character> previousCharacters, boolean isInsideComment, boolean isInsideScript) { | |
| 652 | ||
| 653 | this.charsRead = charsRead; | |
| 654 | this.isInsideTag = isInsideTag; | |
| 655 | this.stringBuilder = stringBuilder; | |
| 656 | this.parseStack = parseStack; | |
| 657 | this.hasEncounteredTagName = hasEncounteredTagName; | |
| 658 | this.isReadingTagName = isReadingTagName; | |
| 659 | this.isStartTag = isStartTag; | |
| 660 | this.isInsideAttributeValueQuoted = isInsideAttributeValueQuoted; | |
| 661 | this.tagName = tagName; | |
| 662 | this.currentAttributeKey = currentAttributeKey; | |
| 663 | this.attributes = attributes; | |
| 664 | this.quoteType = quoteType; | |
| 665 | this.isReadingAttributeKey = isReadingAttributeKey; | |
| 666 | this.isHalfClosedTag = isHalfClosedTag; | |
| 667 | this.lineRow = lineRow; | |
| 668 | this.lineColumn = lineColumn; | |
| 669 | this.previousCharacters = previousCharacters; | |
| 670 | this.isInsideComment = isInsideComment; | |
| 671 | this.isInsideScript = isInsideScript; | |
| 672 | } | |
| 673 | } | |
| 674 | ||
| 675 | /** | |
| 676 | * Search the node tree for matching elements: each node in {@code nodes} is searched with the given tag name and attributes, and all results are collected, in order, into one new list. | |
| 677 | * <p> | |
| 678 | * If zero nodes are found, returns an empty list. | |
| 679 | * </p> | |
| 680 | */ | |
| 681 | public List<HtmlParseNode> search(List<HtmlParseNode> nodes, TagName tagName, Map<String, String> attributes) { | |
| 682 | List<HtmlParseNode> foundNodes = new ArrayList<>(); | |
| 683 | for (var node : nodes) { | |
| 684 | var result = node.search(tagName, attributes); | |
| 685 | foundNodes.addAll(result); | |
| 686 | } | |
| 687 |
1
1. search : replaced return value with Collections.emptyList for com/renomad/minum/htmlparsing/HtmlParser::search → KILLED |
return foundNodes; |
| 688 | } | |
| 689 | ||
| 690 | } | |
Mutations | ||
| 59 |
1.1 2.2 |
|
| 66 |
1.1 |
|
| 68 |
1.1 |
|
| 84 |
1.1 |
|
| 87 |
1.1 |
|
| 88 |
1.1 |
|
| 89 |
1.1 |
|
| 90 |
1.1 |
|
| 93 |
1.1 |
|
| 98 |
1.1 |
|
| 99 |
1.1 |
|
| 100 |
1.1 |
|
| 101 |
1.1 |
|
| 103 |
1.1 |
|
| 112 |
1.1 |
|
| 113 |
1.1 |
|
| 114 |
1.1 |
|
| 117 |
1.1 |
|
| 122 |
1.1 |
|
| 123 |
1.1 |
|
| 138 |
1.1 |
|
| 145 |
1.1 |
|
| 150 |
1.1 |
|
| 152 |
1.1 2.2 3.3 |
|
| 156 |
1.1 |
|
| 158 |
1.1 |
|
| 168 |
1.1 |
|
| 172 |
1.1 2.2 3.3 4.4 |
|
| 177 |
1.1 |
|
| 184 |
1.1 |
|
| 192 |
1.1 |
|
| 206 |
1.1 |
|
| 211 |
1.1 2.2 |
|
| 212 |
1.1 |
|
| 222 |
1.1 |
|
| 238 |
1.1 2.2 |
|
| 240 |
1.1 |
|
| 241 |
1.1 |
|
| 242 |
1.1 |
|
| 243 |
1.1 |
|
| 244 |
1.1 |
|
| 245 |
1.1 |
|
| 252 |
1.1 2.2 3.3 |
|
| 267 |
1.1 2.2 3.3 |
|
| 270 |
1.1 2.2 |
|
| 284 |
1.1 2.2 |
|
| 285 |
1.1 2.2 |
|
| 286 |
1.1 |
|
| 291 |
1.1 |
|
| 292 |
1.1 |
|
| 303 |
1.1 |
|
| 305 |
1.1 |
|
| 311 |
1.1 |
|
| 315 |
1.1 |
|
| 325 |
1.1 2.2 |
|
| 326 |
1.1 2.2 |
|
| 331 |
1.1 |
|
| 334 |
1.1 |
|
| 347 |
1.1 2.2 |
|
| 354 |
1.1 2.2 |
|
| 372 |
1.1 |
|
| 382 |
1.1 2.2 |
|
| 387 |
1.1 |
|
| 400 |
1.1 |
|
| 425 |
1.1 |
|
| 431 |
1.1 |
|
| 438 |
1.1 |
|
| 460 |
1.1 |
|
| 462 |
1.1 |
|
| 465 |
1.1 |
|
| 468 |
1.1 |
|
| 471 |
1.1 2.2 |
|
| 475 |
1.1 |
|
| 476 |
1.1 |
|
| 479 |
1.1 |
|
| 498 |
1.1 |
|
| 502 |
1.1 |
|
| 518 |
1.1 |
|
| 519 |
1.1 |
|
| 521 |
1.1 |
|
| 547 |
1.1 |
|
| 687 |
1.1 |