1 /*
2 * Copyright (c) 2002-2025 Gargoyle Software Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 * https://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15 package org.htmlunit;
16
17 import java.io.Serializable;
18 import java.net.URL;
19 import java.util.Collections;
20 import java.util.Date;
21 import java.util.HashMap;
22 import java.util.Map;
23 import java.util.regex.Matcher;
24 import java.util.regex.Pattern;
25
26 import org.htmlunit.cssparser.dom.CSSStyleSheetImpl;
27 import org.htmlunit.http.HttpUtils;
28 import org.htmlunit.util.HeaderUtils;
29 import org.htmlunit.util.UrlUtils;
30
31 /**
32 * <p>Simple cache implementation which caches compiled JavaScript files and parsed CSS snippets. Caching
33 * compiled JavaScript files avoids unnecessary web requests and additional compilation overhead, while
34 * caching parsed CSS snippets avoids very expensive CSS parsing.</p>
35 *
36 * @author Marc Guillemot
37 * @author Daniel Gredler
38 * @author Ahmed Ashour
39 * @author Anton Demydenko
40 * @author Ronald Brill
41 * @author Ashley Frieze
42 */
43 public class Cache implements Serializable {
44
/** The maximum size of the cache (number of entries); enforced by {@link #deleteOverflow()}. */
private int maxSize_ = 40;

/**
 * Matches header values consisting solely of an optionally negative integer (for instance
 * <code>0</code> or <code>-1</code>). {@link #parseDateHeader(WebResponse, String)} returns the
 * current time for such values instead of handing them to the date parser.
 */
private static final Pattern DATE_HEADER_PATTERN = Pattern.compile("-?\\d+");

/**
 * Ten minutes in milliseconds. An <code>Expires</code> date has to lie further ahead than this
 * for a response to be regarded as fresh.
 */
static final long DELAY = 10 * org.apache.commons.lang3.time.DateUtils.MILLIS_PER_MINUTE;

// for taking ten percent of a number in milliseconds and converting that to the amount in seconds
// (0.1 / 1000 = 0.0001)
private static final double TEN_PERCENT_OF_MILLISECONDS_IN_SECONDS = 0.0001;

/**
 * The map which holds the cached responses. Note that when keying on URLs, we key on the string version
 * of the URLs, rather than on the URLs themselves. This is done for performance, because a) the
 * {@link java.net.URL#hashCode()} method is synchronized, and b) the {@link java.net.URL#hashCode()}
 * method triggers DNS lookups of the URL hostnames' IPs. As of this writing, the HtmlUnit unit tests
 * run ~20% faster when keying on strings rather than on {@link java.net.URL} instances.
 *
 * <p>Keys are either normalized URL strings (entries for responses) or the raw CSS text (entries for
 * parsed style sheets). The map is a synchronized wrapper; compound operations and iterations lock
 * on the map itself.</p>
 */
private final Map<String, Entry> entries_ = Collections.synchronizedMap(new HashMap<>(maxSize_));
62
63 /**
64 * A cache entry.
65 */
66 private static class Entry implements Comparable<Entry>, Serializable {
67 private final String key_;
68 private final WebResponse response_;
69 private final Object value_;
70 private long lastAccess_;
71 private final long createdAt_;
72
73 Entry(final String key, final WebResponse response, final Object value) {
74 key_ = key;
75 response_ = response;
76 value_ = value;
77 createdAt_ = System.currentTimeMillis();
78 lastAccess_ = createdAt_;
79 }
80
81 /**
82 * {@inheritDoc}
83 */
84 @Override
85 public int compareTo(final Entry other) {
86 return Long.compare(lastAccess_, other.lastAccess_);
87 }
88
89 /**
90 * {@inheritDoc}
91 */
92 @Override
93 public boolean equals(final Object obj) {
94 return obj instanceof Entry && lastAccess_ == ((Entry) obj).lastAccess_;
95 }
96
97 /**
98 * {@inheritDoc}
99 */
100 @Override
101 public int hashCode() {
102 return ((Long) lastAccess_).hashCode();
103 }
104
105 /**
106 * Updates the last access date.
107 */
108 public void touch() {
109 lastAccess_ = System.currentTimeMillis();
110 }
111
112 /**
113 * Is this cached entry still fresh?
114 * @param now the current time
115 * @return <code>true</code> if can keep in the cache
116 * @see #isWithinCacheWindow(WebResponse, long, long)
117 */
118 boolean isStillFresh(final long now) {
119 return Cache.isWithinCacheWindow(response_, now, createdAt_);
120 }
121 }
122
123 /**
124 * <p>Find expiry time using
125 * a) s-maxage specified<br />
126 * b) max-age specified<br />
127 * c) expired specified<br />
128 * d) A Last-Update is specified and the time is now within 10% of the difference between download time and update
129 * time</p>
130 *
131 * @see <a href="https://datatracker.ietf.org/doc/html/rfc7234#section-4.2.2">RFC 7234</a>
132 *
133 * @param response {@link WebResponse}
134 * @param now the current time
135 * @param createdAt when the request was downloaded
136 * @return true if still fresh
137 */
138 static boolean isWithinCacheWindow(final WebResponse response, final long now, final long createdAt) {
139 long freshnessLifetime = 0;
140 if (!HeaderUtils.containsPrivate(response) && HeaderUtils.containsSMaxage(response)) {
141 // check s-maxage
142 freshnessLifetime = HeaderUtils.sMaxage(response);
143 }
144 else if (HeaderUtils.containsMaxAge(response)) {
145 // check max-age
146 freshnessLifetime = HeaderUtils.maxAge(response);
147 }
148 else if (response.getResponseHeaderValue(HttpHeader.EXPIRES) != null) {
149 final Date expires = parseDateHeader(response, HttpHeader.EXPIRES);
150 if (expires != null) {
151 // use the same logic as in isCacheableContent()
152 return expires.getTime() - now > DELAY;
153 }
154 }
155 else if (response.getResponseHeaderValue(HttpHeader.LAST_MODIFIED) != null) {
156 final Date lastModified = parseDateHeader(response, HttpHeader.LAST_MODIFIED);
157 if (lastModified != null) {
158 freshnessLifetime = (long) ((createdAt - lastModified.getTime())
159 * TEN_PERCENT_OF_MILLISECONDS_IN_SECONDS);
160 }
161 }
162 return now - createdAt < freshnessLifetime * org.apache.commons.lang3.time.DateUtils.MILLIS_PER_SECOND;
163 }
164
165 /**
166 * Caches the specified object, if the corresponding request and response objects indicate
167 * that it is cacheable.
168 *
169 * @param request the request corresponding to the specified compiled script
170 * @param response the response corresponding to the specified compiled script
171 * @param toCache the object that is to be cached, if possible (for instance a compiled script or
172 * simply a WebResponse)
173 * @return whether the response was cached or not
174 */
175 public boolean cacheIfPossible(final WebRequest request, final WebResponse response, final Object toCache) {
176 if (isCacheable(request, response)) {
177 final URL url = request.getUrl();
178 if (url == null) {
179 return false;
180 }
181
182 final Entry entry = new Entry(UrlUtils.normalize(url), response, toCache);
183 entries_.put(entry.key_, entry);
184 deleteOverflow();
185 return true;
186 }
187
188 return false;
189 }
190
191 /**
192 * Caches the parsed version of the specified CSS snippet. We key the cache based on CSS snippets (rather
193 * than requests and responses as is done above) because a) this allows us to cache inline CSS, b) CSS is
194 * extremely expensive to parse, so we want to avoid it as much as possible, c) CSS files aren't usually
195 * nearly as large as JavaScript files, so memory bloat won't be too bad, and d) caching on requests and
196 * responses requires checking dynamically (see {@link #isCacheableContent(WebResponse)}), and headers often
197 * aren't set up correctly, disallowing caching when in fact it should be allowed.
198 *
199 * @param css the CSS snippet from which <code>styleSheet</code> is derived
200 * @param styleSheet the parsed version of <code>css</code>
201 */
202 public void cache(final String css, final CSSStyleSheetImpl styleSheet) {
203 final Entry entry = new Entry(css, null, styleSheet);
204 entries_.put(entry.key_, entry);
205 deleteOverflow();
206 }
207
208 /**
209 * Truncates the cache to the maximal number of entries.
210 */
211 protected void deleteOverflow() {
212 synchronized (entries_) {
213 while (entries_.size() > maxSize_) {
214 final Entry oldestEntry = Collections.min(entries_.values());
215 entries_.remove(oldestEntry.key_);
216 if (oldestEntry.response_ != null) {
217 oldestEntry.response_.cleanUp();
218 }
219 }
220 }
221 }
222
223 /**
224 * Determines if the specified response can be cached.
225 *
226 * @param request the performed request
227 * @param response the received response
228 * @return {@code true} if the response can be cached
229 */
230 protected boolean isCacheable(final WebRequest request, final WebResponse response) {
231 return HttpMethod.GET == response.getWebRequest().getHttpMethod()
232 && UrlUtils.URL_ABOUT_BLANK != request.getUrl()
233 && isCacheableContent(response);
234 }
235
236 /**
237 * <p>Perform prior validation for 'no-store' directive in Cache-Control header.</p>
238 *
239 * <p>Tries to guess if the content is dynamic or not.</p>
240 *
241 * <p>"Since origin servers do not always provide explicit expiration times, HTTP caches typically
242 * assign heuristic expiration times, employing algorithms that use other header values (such as the
243 * <code>Last-Modified</code> time) to estimate a plausible expiration time".</p>
244 *
245 * <p>The current implementation considers as dynamic content everything except responses with a
246 * <code>Last-Modified</code> header with a date older than 10 minutes or with an <code>Expires</code> header
247 * specifying expiration in more than 10 minutes.</p>
248 *
249 * @see <a href="https://tools.ietf.org/html/rfc7234">RFC 7234</a>
250 * @see <a href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html">RFC 2616</a>
251 * @param response the response to examine
252 * @return {@code true} if the response should be considered as cacheable
253 */
254 protected boolean isCacheableContent(final WebResponse response) {
255 if (HeaderUtils.containsNoStore(response)) {
256 return false;
257 }
258
259 final long now = getCurrentTimestamp();
260 return isWithinCacheWindow(response, now, now);
261 }
262
/**
 * Gets the current time stamp in milliseconds, as reported by {@link System#currentTimeMillis()}.
 * This is a separate method so that it can be overridden when another time has to be simulated.
 *
 * @return the current time stamp
 */
protected long getCurrentTimestamp() {
    return System.currentTimeMillis();
}
270
271 /**
272 * Parses and returns the specified date header of the specified response. This method
273 * returns {@code null} if the specified header cannot be found or cannot be parsed as a date.
274 *
275 * @param response the response
276 * @param headerName the header name
277 * @return the specified date header of the specified response
278 */
279 protected static Date parseDateHeader(final WebResponse response, final String headerName) {
280 final String value = response.getResponseHeaderValue(headerName);
281 if (value == null) {
282 return null;
283 }
284 final Matcher matcher = DATE_HEADER_PATTERN.matcher(value);
285 if (matcher.matches()) {
286 return new Date();
287 }
288 return HttpUtils.parseDate(value);
289 }
290
291 /**
292 * Returns the cached response corresponding to the specified request. If there is
293 * no corresponding cached object, this method returns {@code null}.
294 *
295 * <p>Calculates and check if object still fresh(RFC 7234) otherwise returns {@code null}.</p>
296 * @see <a href="https://tools.ietf.org/html/rfc7234">RFC 7234</a>
297 *
298 * @param request the request whose corresponding response is sought
299 * @return the cached response corresponding to the specified request if any
300 */
301 public WebResponse getCachedResponse(final WebRequest request) {
302 final Entry cachedEntry = getCacheEntry(request);
303 if (cachedEntry == null) {
304 return null;
305 }
306 return cachedEntry.response_;
307 }
308
309 /**
310 * Returns the cached object corresponding to the specified request. If there is
311 * no corresponding cached object, this method returns {@code null}.
312 *
313 * <p>Calculates and check if object still fresh(RFC 7234) otherwise returns {@code null}.</p>
314 * @see <a href="https://tools.ietf.org/html/rfc7234">RFC 7234</a>
315 *
316 * @param request the request whose corresponding cached compiled script is sought
317 * @return the cached object corresponding to the specified request if any
318 */
319 public Object getCachedObject(final WebRequest request) {
320 final Entry cachedEntry = getCacheEntry(request);
321 if (cachedEntry == null) {
322 return null;
323 }
324 return cachedEntry.value_;
325 }
326
327 private Entry getCacheEntry(final WebRequest request) {
328 if (HttpMethod.GET != request.getHttpMethod()) {
329 return null;
330 }
331
332 final URL url = request.getUrl();
333 if (url == null) {
334 return null;
335 }
336
337 final String normalizedUrl = UrlUtils.normalize(url);
338 final Entry cachedEntry = entries_.get(normalizedUrl);
339 if (cachedEntry == null) {
340 return null;
341 }
342
343 if (cachedEntry.isStillFresh(getCurrentTimestamp())) {
344 synchronized (entries_) {
345 cachedEntry.touch();
346 }
347 return cachedEntry;
348 }
349 entries_.remove(UrlUtils.normalize(url));
350 return null;
351 }
352
353 /**
354 * Returns the cached parsed version of the specified CSS snippet. If there is no
355 * corresponding cached stylesheet, this method returns {@code null}.
356 *
357 * @param css the CSS snippet whose cached stylesheet is sought
358 * @return the cached stylesheet corresponding to the specified CSS snippet
359 */
360 public CSSStyleSheetImpl getCachedStyleSheet(final String css) {
361 final Entry cachedEntry = entries_.get(css);
362 if (cachedEntry == null) {
363 return null;
364 }
365 synchronized (entries_) {
366 cachedEntry.touch();
367 }
368 return (CSSStyleSheetImpl) cachedEntry.value_;
369 }
370
/**
 * Returns the cache's maximum size. This is the maximum number of entries (cached files as well
 * as parsed CSS snippets) that will be kept. The default is <code>40</code>.
 *
 * @return the cache's maximum size
 */
public int getMaxSize() {
    return maxSize_;
}
380
/**
 * Sets the cache's maximum size. This is the maximum number of entries (cached files as well
 * as parsed CSS snippets) that will be kept. The default is <code>40</code>.
 *
 * <p>If the cache currently holds more entries than the new maximum, the surplus is removed
 * right away (see {@link #deleteOverflow()}).</p>
 *
 * @param maxSize the cache's maximum size (must be &gt;= 0)
 * @throws IllegalArgumentException if <code>maxSize</code> is negative
 */
public void setMaxSize(final int maxSize) {
    if (maxSize < 0) {
        throw new IllegalArgumentException("Illegal value for maxSize: " + maxSize);
    }
    maxSize_ = maxSize;
    // apply the new limit immediately
    deleteOverflow();
}
394
/**
 * Returns the number of entries currently held in the cache. Entries that are no longer fresh
 * but have not been removed yet are included in this number.
 *
 * @return the number of entries in the cache
 */
public int getSize() {
    return entries_.size();
}
403
404 /**
405 * Clears the cache.
406 */
407 public void clear() {
408 synchronized (entries_) {
409 for (final Entry entry : entries_.values()) {
410 if (entry.response_ != null) {
411 entry.response_.cleanUp();
412 }
413 }
414 entries_.clear();
415 }
416 }
417
418 /**
419 * Removes outdated entries from the cache.
420 */
421 public void clearOutdated() {
422 synchronized (entries_) {
423 final long now = getCurrentTimestamp();
424
425 entries_.entrySet().removeIf(entry -> entry.getValue().response_ == null
426 || !entry.getValue().isStillFresh(now));
427 }
428 }
429 }