View Javadoc
1   /*
2    * Copyright (c) 2002-2025 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * https://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package org.htmlunit.html;
16  
17  import java.io.Serializable;
18  
19  import org.w3c.dom.DOMException;
20  import org.w3c.dom.Node;
21  import org.w3c.dom.traversal.NodeFilter;
22  
23  /**
24   * In general this is an implementation of org.w3c.dom.traversal.TreeWalker.
25   * The class org.w3c.dom.traversal.TreeWalker is not available on Android
26   * therefore we have this impl as backend.
27   *
28   * @see <a href="http://www.w3.org/TR/DOM-Level-2-Traversal-Range/traversal.html">
29   *     DOM-Level-2-Traversal-Range</a>
30   * @author Mike Dirolf
31   * @author Frank Danek
32   * @author Ahmed Ashour
33   * @author Ronald Brill
34   */
35  public class HtmlDomTreeWalker implements Serializable {
36  
37      private final DomNode root_;
38      private DomNode currentNode_;
39      private final int whatToShow_;
40      private final NodeFilter filter_;
41      private final boolean expandEntityReferences_;
42  
43      /**
44       * Creates an instance.
45       *
46       * @param root The root node of the TreeWalker. Must not be
47       *          {@code null}.
48       * @param whatToShow Flag specifying which types of nodes appear in the
49       *          logical view of the TreeWalker. See {@link NodeFilter} for the
50       *          set of possible Show_ values.
51       * @param filter The {@link NodeFilter} to be used with this TreeWalker,
52       *          or {@code null} to indicate no filter.
53       * @param expandEntityReferences If false, the contents of
54       *          EntityReference nodes are not present in the logical view.
55       * @throws DOMException on attempt to create a TreeWalker with a root that
56       *          is {@code null}.
57       */
58      public HtmlDomTreeWalker(final DomNode root, final int whatToShow, final NodeFilter filter,
59              final boolean expandEntityReferences) throws DOMException {
60          if (root == null) {
61              throw new IllegalArgumentException("root must not be null");
62          }
63          root_ = root;
64          whatToShow_ = whatToShow;
65          filter_ = filter;
66          expandEntityReferences_ = expandEntityReferences;
67          currentNode_ = root_;
68      }
69  
70      /**
71       * @see org.w3c.dom.traversal.TreeWalker#getRoot()
72       * @return the root node
73       */
74      public DomNode getRoot() {
75          return root_;
76      }
77  
78      /**
79       * @see org.w3c.dom.traversal.TreeWalker#getWhatToShow()
80       * @return NodeFilter constant
81       */
82      public int getWhatToShow() {
83          return whatToShow_;
84      }
85  
86      /**
87       * @see org.w3c.dom.traversal.TreeWalker#getFilter()
88       * @return the filter
89       */
90      public NodeFilter getFilter() {
91          return filter_;
92      }
93  
94      /**
95       * @see org.w3c.dom.traversal.TreeWalker#getExpandEntityReferences()
96       * @return the ExpandEntityReferences setting
97       */
98      @SuppressWarnings("PMD.BooleanGetMethodName")
99      public boolean getExpandEntityReferences() {
100         return expandEntityReferences_;
101     }
102 
103     /**
104      * @see org.w3c.dom.traversal.TreeWalker#getCurrentNode()
105      * @return the current node
106      */
107     public DomNode getCurrentNode() {
108         return currentNode_;
109     }
110 
111     /**
112      * @see org.w3c.dom.traversal.TreeWalker#setCurrentNode(Node)
113      * @param currentNode the current node
114      * @throws DOMException if the current node provides is null
115      */
116     public void setCurrentNode(final Node currentNode) throws DOMException {
117         if (currentNode == null) {
118             throw new DOMException(DOMException.NOT_SUPPORTED_ERR,
119                     "currentNode cannot be set to null");
120         }
121         currentNode_ = (DomNode) currentNode;
122     }
123 
124     /**
125      * @see org.w3c.dom.traversal.TreeWalker#nextNode()
126      * @return the next node
127      */
128     public DomNode nextNode() {
129         final DomNode leftChild = getEquivalentLogical(currentNode_.getFirstChild(), false);
130         if (leftChild != null) {
131             currentNode_ = leftChild;
132             return leftChild;
133         }
134         final DomNode rightSibling = getEquivalentLogical(currentNode_.getNextSibling(), false);
135         if (rightSibling != null) {
136             currentNode_ = rightSibling;
137             return rightSibling;
138         }
139 
140         final DomNode uncle = getFirstUncleNode(currentNode_);
141         if (uncle != null) {
142             currentNode_ = uncle;
143             return uncle;
144         }
145 
146         return null;
147     }
148 
149     /**
150      * Helper method to get the first uncle node in document order (preorder
151      * traversal) from the given node.
152      */
153     private DomNode getFirstUncleNode(final DomNode n) {
154         if (n == root_ || n == null) {
155             return null;
156         }
157 
158         final DomNode parent = n.getParentNode();
159         if (parent == null) {
160             return null;
161         }
162 
163         final DomNode uncle = getEquivalentLogical(parent.getNextSibling(), false);
164         if (uncle != null) {
165             return uncle;
166         }
167 
168         return getFirstUncleNode(parent);
169     }
170 
171     /**
172      * Recursively find the logical node occupying the same position as this
173      * _actual_ node. It could be the same node, a different node, or null
174      * depending on filtering.
175      *
176      * @param n The actual node we are trying to find the "equivalent" of
177      * @param lookLeft If true, traverse the tree in the left direction. If
178      *          false, traverse the tree to the right.
179      * @return the logical node in the same position as n
180      */
181     private DomNode getEquivalentLogical(final DomNode n, final boolean lookLeft) {
182         // Base cases
183         if (n == null) {
184             return null;
185         }
186         if (isNodeVisible(n)) {
187             return n;
188         }
189 
190         // If a node is skipped, try getting one of its descendants
191         if (isNodeSkipped(n)) {
192             final DomNode child;
193             if (lookLeft) {
194                 child = getEquivalentLogical(n.getLastChild(), lookLeft);
195             }
196             else {
197                 child = getEquivalentLogical(n.getFirstChild(), lookLeft);
198             }
199 
200             if (child != null) {
201                 return child;
202             }
203         }
204 
205         // If this node is rejected or has no descendants that will work, go
206         // to its sibling.
207         return getSibling(n, lookLeft);
208     }
209 
210     /**
211      * Returns whether the node is visible by the TreeWalker.
212      */
213     private boolean isNodeVisible(final Node n) {
214         if (acceptNode(n) == NodeFilter.FILTER_ACCEPT) {
215             if (filter_ == null || filter_.acceptNode(n) == NodeFilter.FILTER_ACCEPT) {
216                 return expandEntityReferences_ || n.getParentNode() == null
217                         || n.getParentNode().getNodeType() != Node.ENTITY_REFERENCE_NODE;
218             }
219         }
220         return false;
221     }
222 
223     /**
224      * Test whether a specified node is visible in the logical view of a
225      * TreeWalker, based solely on the whatToShow constant.
226      *
227      * @param n The node to check to see if it should be shown or not
228      * @return a constant to determine whether the node is accepted, rejected,
229      *          or skipped.
230      */
231     private short acceptNode(final Node n) {
232         final int flag = getFlagForNode(n);
233 
234         if ((whatToShow_ & flag) != 0) {
235             return NodeFilter.FILTER_ACCEPT;
236         }
237         // Skip, don't reject.
238         return NodeFilter.FILTER_SKIP;
239     }
240 
241     /**
242      * <span style="color:red">INTERNAL API - SUBJECT TO CHANGE AT ANY TIME - USE AT YOUR OWN RISK.</span><br>
243      *
244      * Given a {@link Node}, return the appropriate constant for whatToShow.
245      *
246      * @param node the node
247      * @return the whatToShow constant for the type of specified node
248      */
249     public static int getFlagForNode(final Node node) {
250         switch (node.getNodeType()) {
251             case Node.ELEMENT_NODE:
252                 return NodeFilter.SHOW_ELEMENT;
253             case Node.ATTRIBUTE_NODE:
254                 return NodeFilter.SHOW_ATTRIBUTE;
255             case Node.TEXT_NODE:
256                 return NodeFilter.SHOW_TEXT;
257             case Node.CDATA_SECTION_NODE:
258                 return NodeFilter.SHOW_CDATA_SECTION;
259             case Node.ENTITY_REFERENCE_NODE:
260                 return NodeFilter.SHOW_ENTITY_REFERENCE;
261             case Node.ENTITY_NODE:
262                 return NodeFilter.SHOW_ENTITY;
263             case Node.PROCESSING_INSTRUCTION_NODE:
264                 return NodeFilter.SHOW_PROCESSING_INSTRUCTION;
265             case Node.COMMENT_NODE:
266                 return NodeFilter.SHOW_COMMENT;
267             case Node.DOCUMENT_NODE:
268                 return NodeFilter.SHOW_DOCUMENT;
269             case Node.DOCUMENT_TYPE_NODE:
270                 return NodeFilter.SHOW_DOCUMENT_TYPE;
271             case Node.DOCUMENT_FRAGMENT_NODE:
272                 return NodeFilter.SHOW_DOCUMENT_FRAGMENT;
273             case Node.NOTATION_NODE:
274                 return NodeFilter.SHOW_NOTATION;
275             default:
276                 return 0;
277         }
278     }
279 
280     /* Returns whether the node is skipped by the TreeWalker. */
281     private boolean isNodeSkipped(final Node n) {
282         return !isNodeVisible(n) && !isNodeRejected(n);
283     }
284 
285     /* Returns whether the node is rejected by the TreeWalker. */
286     private boolean isNodeRejected(final Node n) {
287         if (acceptNode(n) == NodeFilter.FILTER_REJECT) {
288             return true;
289         }
290         if (filter_ != null && filter_.acceptNode(n) == NodeFilter.FILTER_REJECT) {
291             return true;
292         }
293         return !expandEntityReferences_ && n.getParentNode() != null
294                 && n.getParentNode().getNodeType() == Node.ENTITY_REFERENCE_NODE;
295     }
296 
297     // Helper method for getEquivalentLogical
298     private DomNode getSibling(final DomNode n, final boolean lookLeft) {
299         if (n == null) {
300             return null;
301         }
302 
303         if (isNodeVisible(n)) {
304             return null;
305         }
306 
307         final DomNode sibling;
308         if (lookLeft) {
309             sibling = n.getPreviousSibling();
310         }
311         else {
312             sibling = n.getNextSibling();
313         }
314 
315         if (sibling == null) {
316             // If this node has no logical siblings at or below it's "level", it might have one above
317             if (n == root_) {
318                 return null;
319             }
320             return getSibling(n.getParentNode(), lookLeft);
321 
322         }
323         return getEquivalentLogical(sibling, lookLeft);
324     }
325 
326     /**
327      * @see org.w3c.dom.traversal.TreeWalker#nextSibling()
328      * @return the next sibling node
329      */
330     public DomNode nextSibling() {
331         if (currentNode_ == root_) {
332             return null;
333         }
334 
335         final DomNode newNode = getEquivalentLogical(currentNode_.getNextSibling(), false);
336 
337         if (newNode != null) {
338             currentNode_ = newNode;
339         }
340 
341         return newNode;
342     }
343 
344     /**
345      * @see org.w3c.dom.traversal.TreeWalker#parentNode()
346      * @return the parent node
347      */
348     public DomNode parentNode() {
349         if (currentNode_ == root_) {
350             return null;
351         }
352 
353         DomNode newNode = currentNode_;
354 
355         do {
356             newNode = newNode.getParentNode();
357         }
358         while (newNode != null && !isNodeVisible(newNode) && newNode != root_);
359 
360         if (newNode == null || !isNodeVisible(newNode)) {
361             return null;
362         }
363         currentNode_ = newNode;
364         return newNode;
365     }
366 
367     /**
368      * @see org.w3c.dom.traversal.TreeWalker#previousSibling()
369      * @return the previous sibling node
370      */
371     public DomNode previousSibling() {
372         if (currentNode_ == root_) {
373             return null;
374         }
375 
376         final DomNode newNode = getEquivalentLogical(currentNode_.getPreviousSibling(), true);
377 
378         if (newNode != null) {
379             currentNode_ = newNode;
380         }
381 
382         return newNode;
383     }
384 
385     /**
386      * @see org.w3c.dom.traversal.TreeWalker#lastChild()
387      * @return the last child node
388      */
389     public DomNode lastChild() {
390         final DomNode newNode = getEquivalentLogical(currentNode_.getLastChild(), true);
391 
392         if (newNode != null) {
393             currentNode_ = newNode;
394         }
395 
396         return newNode;
397     }
398 
399     /**
400      * @see org.w3c.dom.traversal.TreeWalker#previousNode()
401      * @return the previous node
402      */
403     public DomNode previousNode() {
404         final DomNode newNode = getPreviousNode(currentNode_);
405 
406         if (newNode != null) {
407             currentNode_ = newNode;
408         }
409 
410         return newNode;
411     }
412 
413     /**
414      * Helper method to get the previous node in document order (preorder
415      * traversal) from the given node.
416      */
417     private DomNode getPreviousNode(final DomNode n) {
418         if (n == root_) {
419             return null;
420         }
421         final DomNode left = getEquivalentLogical(n.getPreviousSibling(), true);
422         if (left == null) {
423             final DomNode parent = n.getParentNode();
424             if (parent == null) {
425                 return null;
426             }
427             if (isNodeVisible(parent)) {
428                 return parent;
429             }
430         }
431 
432         DomNode follow = left;
433         if (follow != null) {
434             while (follow.hasChildNodes()) {
435                 final DomNode toFollow = getEquivalentLogical(follow.getLastChild(), true);
436                 if (toFollow == null) {
437                     break;
438                 }
439                 follow = toFollow;
440             }
441         }
442         return follow;
443     }
444 
445     /**
446      * @see org.w3c.dom.traversal.TreeWalker#firstChild()
447      * @return the first child node
448      */
449     public DomNode firstChild() {
450         final DomNode newNode = getEquivalentLogical(currentNode_.getFirstChild(), false);
451 
452         if (newNode != null) {
453             currentNode_ = newNode;
454         }
455 
456         return newNode;
457     }
458 }