View Javadoc
1   /*
2    * Copyright (c) 2002-2026 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * https://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package org.htmlunit.html;
16  
17  import java.io.Serializable;
18  
19  import org.w3c.dom.DOMException;
20  import org.w3c.dom.Node;
21  import org.w3c.dom.traversal.NodeFilter;
22  
23  /**
24   * In general this is an implementation of org.w3c.dom.traversal.TreeWalker.
25   * The class org.w3c.dom.traversal.TreeWalker is not available on Android
26   * therefore we have this impl as backend.
27   *
28   * @see <a href="http://www.w3.org/TR/DOM-Level-2-Traversal-Range/traversal.html">
29   *     DOM-Level-2-Traversal-Range</a>
30   * @author Mike Dirolf
31   * @author Frank Danek
32   * @author Ahmed Ashour
33   * @author Ronald Brill
34   */
35  public class HtmlDomTreeWalker implements Serializable {
36  
37      private final DomNode root_;
38      private DomNode currentNode_;
39      private final int whatToShow_;
40      private final NodeFilter filter_;
41      private final boolean expandEntityReferences_;
42  
43      /**
44       * Creates an instance.
45       *
46       * @param root The root node of the TreeWalker. Must not be
47       *          {@code null}.
48       * @param whatToShow Flag specifying which types of nodes appear in the
49       *          logical view of the TreeWalker. See {@link NodeFilter} for the
50       *          set of possible Show_ values.
51       * @param filter The {@link NodeFilter} to be used with this TreeWalker,
52       *          or {@code null} to indicate no filter.
53       * @param expandEntityReferences If false, the contents of
54       *          EntityReference nodes are not present in the logical view.
55       * @throws DOMException on attempt to create a TreeWalker with a root that
56       *          is {@code null}.
57       */
58      public HtmlDomTreeWalker(final DomNode root, final int whatToShow, final NodeFilter filter,
59              final boolean expandEntityReferences) throws DOMException {
60          if (root == null) {
61              throw new IllegalArgumentException("root must not be null");
62          }
63          root_ = root;
64          whatToShow_ = whatToShow;
65          filter_ = filter;
66          expandEntityReferences_ = expandEntityReferences;
67          currentNode_ = root_;
68      }
69  
70      /**
71       * @see org.w3c.dom.traversal.TreeWalker#getRoot()
72       * @return the root node
73       */
74      public DomNode getRoot() {
75          return root_;
76      }
77  
78      /**
79       * @see org.w3c.dom.traversal.TreeWalker#getWhatToShow()
80       * @return NodeFilter constant
81       */
82      public int getWhatToShow() {
83          return whatToShow_;
84      }
85  
86      /**
87       * @see org.w3c.dom.traversal.TreeWalker#getFilter()
88       * @return the filter
89       */
90      public NodeFilter getFilter() {
91          return filter_;
92      }
93  
94      /**
95       * @see org.w3c.dom.traversal.TreeWalker#getExpandEntityReferences()
96       * @return the ExpandEntityReferences setting
97       */
98      @SuppressWarnings("PMD.BooleanGetMethodName")
99      public boolean getExpandEntityReferences() {
100         return expandEntityReferences_;
101     }
102 
103     /**
104      * @see org.w3c.dom.traversal.TreeWalker#getCurrentNode()
105      * @return the current node
106      */
107     public DomNode getCurrentNode() {
108         return currentNode_;
109     }
110 
111     /**
112      * @see org.w3c.dom.traversal.TreeWalker#setCurrentNode(Node)
113      * @param currentNode the current node
114      * @throws DOMException if the current node provides is null
115      */
116     public void setCurrentNode(final Node currentNode) throws DOMException {
117         if (currentNode == null) {
118             throw new DOMException(DOMException.NOT_SUPPORTED_ERR,
119                     "currentNode cannot be set to null");
120         }
121         currentNode_ = (DomNode) currentNode;
122     }
123 
124     /**
125      * @see org.w3c.dom.traversal.TreeWalker#nextNode()
126      * @return the next node
127      */
128     public DomNode nextNode() {
129         final DomNode leftChild = getEquivalentLogical(currentNode_.getFirstChild(), false);
130         if (leftChild != null) {
131             currentNode_ = leftChild;
132             return leftChild;
133         }
134         final DomNode rightSibling = getEquivalentLogical(currentNode_.getNextSibling(), false);
135         if (rightSibling != null) {
136             currentNode_ = rightSibling;
137             return rightSibling;
138         }
139 
140         final DomNode uncle = getFirstUncleNode(currentNode_);
141         if (uncle != null) {
142             currentNode_ = uncle;
143             return uncle;
144         }
145 
146         return null;
147     }
148 
149     /**
150      * Helper method to get the first uncle node in document order (preorder
151      * traversal) from the given node.
152      */
153     private DomNode getFirstUncleNode(final DomNode n) {
154         if (n == root_ || n == null) {
155             return null;
156         }
157 
158         final DomNode parent = n.getParentNode();
159         if (parent == null) {
160             return null;
161         }
162 
163         final DomNode uncle = getEquivalentLogical(parent.getNextSibling(), false);
164         if (uncle != null) {
165             return uncle;
166         }
167 
168         return getFirstUncleNode(parent);
169     }
170 
171     /**
172      * Recursively find the logical node occupying the same position as this
173      * _actual_ node. It could be the same node, a different node, or null
174      * depending on filtering.
175      *
176      * @param n The actual node we are trying to find the "equivalent" of
177      * @param lookLeft If true, traverse the tree in the left direction. If
178      *          false, traverse the tree to the right.
179      * @return the logical node in the same position as n
180      */
181     private DomNode getEquivalentLogical(final DomNode n, final boolean lookLeft) {
182         // Base cases
183         if (n == null) {
184             return null;
185         }
186         if (isNodeVisible(n)) {
187             return n;
188         }
189 
190         // If a node is skipped, try getting one of its descendants
191         if (isNodeSkipped(n)) {
192             final DomNode child;
193             if (lookLeft) {
194                 child = getEquivalentLogical(n.getLastChild(), lookLeft);
195             }
196             else {
197                 child = getEquivalentLogical(n.getFirstChild(), lookLeft);
198             }
199 
200             if (child != null) {
201                 return child;
202             }
203         }
204 
205         // If this node is rejected or has no descendants that will work, go
206         // to its sibling.
207         return getSibling(n, lookLeft);
208     }
209 
210     /**
211      * Returns whether the node is visible by the TreeWalker.
212      */
213     private boolean isNodeVisible(final Node n) {
214         if (acceptNode(n) == NodeFilter.FILTER_ACCEPT) {
215             if (filter_ == null || filter_.acceptNode(n) == NodeFilter.FILTER_ACCEPT) {
216                 return expandEntityReferences_ || n.getParentNode() == null
217                         || n.getParentNode().getNodeType() != Node.ENTITY_REFERENCE_NODE;
218             }
219         }
220         return false;
221     }
222 
223     /**
224      * Test whether a specified node is visible in the logical view of a
225      * TreeWalker, based solely on the whatToShow constant.
226      *
227      * @param n The node to check to see if it should be shown or not
228      * @return a constant to determine whether the node is accepted, rejected,
229      *          or skipped.
230      */
231     private short acceptNode(final Node n) {
232         final int flag = getFlagForNode(n);
233 
234         if ((whatToShow_ & flag) != 0) {
235             return NodeFilter.FILTER_ACCEPT;
236         }
237         // Skip, don't reject.
238         return NodeFilter.FILTER_SKIP;
239     }
240 
241     /**
242      * <span style="color:red">INTERNAL API - SUBJECT TO CHANGE AT ANY TIME - USE AT YOUR OWN RISK.</span><br>
243      *
244      * Given a {@link Node}, return the appropriate constant for whatToShow.
245      *
246      * @param node the node
247      * @return the whatToShow constant for the type of specified node
248      */
249     public static int getFlagForNode(final Node node) {
250         return switch (node.getNodeType()) {
251             case Node.ELEMENT_NODE -> NodeFilter.SHOW_ELEMENT;
252             case Node.ATTRIBUTE_NODE -> NodeFilter.SHOW_ATTRIBUTE;
253             case Node.TEXT_NODE -> NodeFilter.SHOW_TEXT;
254             case Node.CDATA_SECTION_NODE -> NodeFilter.SHOW_CDATA_SECTION;
255             case Node.ENTITY_REFERENCE_NODE -> NodeFilter.SHOW_ENTITY_REFERENCE;
256             case Node.ENTITY_NODE -> NodeFilter.SHOW_ENTITY;
257             case Node.PROCESSING_INSTRUCTION_NODE -> NodeFilter.SHOW_PROCESSING_INSTRUCTION;
258             case Node.COMMENT_NODE -> NodeFilter.SHOW_COMMENT;
259             case Node.DOCUMENT_NODE -> NodeFilter.SHOW_DOCUMENT;
260             case Node.DOCUMENT_TYPE_NODE -> NodeFilter.SHOW_DOCUMENT_TYPE;
261             case Node.DOCUMENT_FRAGMENT_NODE -> NodeFilter.SHOW_DOCUMENT_FRAGMENT;
262             case Node.NOTATION_NODE -> NodeFilter.SHOW_NOTATION;
263             default -> 0;
264         };
265     }
266 
267     /* Returns whether the node is skipped by the TreeWalker. */
268     private boolean isNodeSkipped(final Node n) {
269         return !isNodeVisible(n) && !isNodeRejected(n);
270     }
271 
272     /* Returns whether the node is rejected by the TreeWalker. */
273     private boolean isNodeRejected(final Node n) {
274         if (acceptNode(n) == NodeFilter.FILTER_REJECT) {
275             return true;
276         }
277         if (filter_ != null && filter_.acceptNode(n) == NodeFilter.FILTER_REJECT) {
278             return true;
279         }
280         return !expandEntityReferences_ && n.getParentNode() != null
281                 && n.getParentNode().getNodeType() == Node.ENTITY_REFERENCE_NODE;
282     }
283 
284     // Helper method for getEquivalentLogical
285     private DomNode getSibling(final DomNode n, final boolean lookLeft) {
286         if (n == null) {
287             return null;
288         }
289 
290         if (isNodeVisible(n)) {
291             return null;
292         }
293 
294         final DomNode sibling;
295         if (lookLeft) {
296             sibling = n.getPreviousSibling();
297         }
298         else {
299             sibling = n.getNextSibling();
300         }
301 
302         if (sibling == null) {
303             // If this node has no logical siblings at or below it's "level", it might have one above
304             if (n == root_) {
305                 return null;
306             }
307             return getSibling(n.getParentNode(), lookLeft);
308 
309         }
310         return getEquivalentLogical(sibling, lookLeft);
311     }
312 
313     /**
314      * @see org.w3c.dom.traversal.TreeWalker#nextSibling()
315      * @return the next sibling node
316      */
317     public DomNode nextSibling() {
318         if (currentNode_ == root_) {
319             return null;
320         }
321 
322         final DomNode newNode = getEquivalentLogical(currentNode_.getNextSibling(), false);
323 
324         if (newNode != null) {
325             currentNode_ = newNode;
326         }
327 
328         return newNode;
329     }
330 
331     /**
332      * @see org.w3c.dom.traversal.TreeWalker#parentNode()
333      * @return the parent node
334      */
335     public DomNode parentNode() {
336         if (currentNode_ == root_) {
337             return null;
338         }
339 
340         DomNode newNode = currentNode_;
341 
342         do {
343             newNode = newNode.getParentNode();
344         }
345         while (newNode != null && !isNodeVisible(newNode) && newNode != root_);
346 
347         if (newNode == null || !isNodeVisible(newNode)) {
348             return null;
349         }
350         currentNode_ = newNode;
351         return newNode;
352     }
353 
354     /**
355      * @see org.w3c.dom.traversal.TreeWalker#previousSibling()
356      * @return the previous sibling node
357      */
358     public DomNode previousSibling() {
359         if (currentNode_ == root_) {
360             return null;
361         }
362 
363         final DomNode newNode = getEquivalentLogical(currentNode_.getPreviousSibling(), true);
364 
365         if (newNode != null) {
366             currentNode_ = newNode;
367         }
368 
369         return newNode;
370     }
371 
372     /**
373      * @see org.w3c.dom.traversal.TreeWalker#lastChild()
374      * @return the last child node
375      */
376     public DomNode lastChild() {
377         final DomNode newNode = getEquivalentLogical(currentNode_.getLastChild(), true);
378 
379         if (newNode != null) {
380             currentNode_ = newNode;
381         }
382 
383         return newNode;
384     }
385 
386     /**
387      * @see org.w3c.dom.traversal.TreeWalker#previousNode()
388      * @return the previous node
389      */
390     public DomNode previousNode() {
391         final DomNode newNode = getPreviousNode(currentNode_);
392 
393         if (newNode != null) {
394             currentNode_ = newNode;
395         }
396 
397         return newNode;
398     }
399 
400     /**
401      * Helper method to get the previous node in document order (preorder
402      * traversal) from the given node.
403      */
404     private DomNode getPreviousNode(final DomNode n) {
405         if (n == root_) {
406             return null;
407         }
408         final DomNode left = getEquivalentLogical(n.getPreviousSibling(), true);
409         if (left == null) {
410             final DomNode parent = n.getParentNode();
411             if (parent == null) {
412                 return null;
413             }
414             if (isNodeVisible(parent)) {
415                 return parent;
416             }
417         }
418 
419         DomNode follow = left;
420         if (follow != null) {
421             while (follow.hasChildNodes()) {
422                 final DomNode toFollow = getEquivalentLogical(follow.getLastChild(), true);
423                 if (toFollow == null) {
424                     break;
425                 }
426                 follow = toFollow;
427             }
428         }
429         return follow;
430     }
431 
432     /**
433      * @see org.w3c.dom.traversal.TreeWalker#firstChild()
434      * @return the first child node
435      */
436     public DomNode firstChild() {
437         final DomNode newNode = getEquivalentLogical(currentNode_.getFirstChild(), false);
438 
439         if (newNode != null) {
440             currentNode_ = newNode;
441         }
442 
443         return newNode;
444     }
445 }