1 /*
2 * Copyright (c) 2002-2026 Gargoyle Software Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 * https://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15 package org.htmlunit.util;
16
17 import static java.nio.charset.StandardCharsets.US_ASCII;
18 import static java.nio.charset.StandardCharsets.UTF_8;
19
20 import java.io.ByteArrayOutputStream;
21 import java.net.MalformedURLException;
22 import java.net.URI;
23 import java.net.URISyntaxException;
24 import java.net.URL;
25 import java.net.URLEncoder;
26 import java.net.URLStreamHandler;
27 import java.nio.charset.Charset;
28 import java.util.BitSet;
29 import java.util.Locale;
30 import java.util.Objects;
31
32 import org.htmlunit.WebAssert;
33 import org.htmlunit.protocol.AnyHandler;
34 import org.htmlunit.protocol.javascript.JavaScriptURLConnection;
35
36 /**
37 * URL utilities class that makes it easy to create new URLs based off of old URLs
38 * without having to assemble or parse them yourself.
39 *
40 * @author Daniel Gredler
41 * @author Martin Tamme
42 * @author Sudhan Moghe
43 * @author Marc Guillemot
44 * @author Ahmed Ashour
45 * @author Ronald Brill
46 * @author Joerg Werner
47 * @author Hartmut Arlt
48 */
49 public final class UrlUtils {
50
/** "about". */
public static final String ABOUT = "about";
/** "about:". */
public static final String ABOUT_SCHEME = ABOUT + ":";
/** "about:blank". */
public static final String ABOUT_BLANK = ABOUT_SCHEME + "blank";
/** URL for "about:blank". */
public static final URL URL_ABOUT_BLANK;

/** Stream handler passed to the URL constructor for <code>javascript:</code> URLs. */
private static final URLStreamHandler JS_HANDLER;
/** Stream handler passed to the URL constructor for <code>about:</code> URLs. */
private static final URLStreamHandler ABOUT_HANDLER;
/** Stream handler passed to the URL constructor for <code>data:</code> URLs. */
private static final URLStreamHandler DATA_HANDLER;

/** Characters that are left unescaped when a URL path is encoded. */
private static final BitSet PATH_ALLOWED_CHARS = new BitSet(256);
/** Characters that are left unescaped when a query string is encoded. */
private static final BitSet QUERY_ALLOWED_CHARS = new BitSet(256);
/** Characters that are left unescaped when an anchor (fragment) is encoded. */
private static final BitSet ANCHOR_ALLOWED_CHARS = new BitSet(256);
/** Characters that are left unescaped when a hash is encoded. */
private static final BitSet HASH_ALLOWED_CHARS = new BitSet(256);
68
/*
 * Static set-up: creates the protocol handlers, the shared "about:blank" URL and the
 * allowed-character sets. The character classes follow HttpClient 3.1's URI bit sets.
 */
static {
    // the handlers have to exist before the first URL is built with them
    JS_HANDLER = new org.htmlunit.protocol.javascript.Handler();
    ABOUT_HANDLER = new org.htmlunit.protocol.about.Handler();
    DATA_HANDLER = new org.htmlunit.protocol.data.Handler();

    try {
        URL_ABOUT_BLANK = new URL(null, ABOUT_BLANK, ABOUT_HANDLER);
    }
    catch (final MalformedURLException e) {
        // should never happen
        throw new RuntimeException(e);
    }

    // reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | "$" | ","
    final BitSet reserved = new BitSet(256);
    for (final char c : ";/?:@&=+$,".toCharArray()) {
        reserved.set(c);
    }

    // unreserved = alphanum | mark
    final BitSet unreserved = new BitSet(256);
    unreserved.set('a', 'z' + 1);
    unreserved.set('A', 'Z' + 1);
    unreserved.set('0', '9' + 1);
    for (final char c : "-_.!~*'()".toCharArray()) {
        unreserved.set(c);
    }

    // escaped = "%" hex hex
    final BitSet escaped = new BitSet(256);
    escaped.set('%');
    escaped.set('0', '9' + 1);
    escaped.set('a', 'f' + 1);
    escaped.set('A', 'F' + 1);

    // uric = reserved | unreserved | escaped
    final BitSet uric = new BitSet(256);
    uric.or(reserved);
    uric.or(unreserved);
    uric.or(escaped);

    // abs_path = "/" path_segments, where a segment is made of pchar and ";"
    final BitSet absPath = new BitSet(256);
    absPath.or(unreserved);
    absPath.or(escaped);
    for (final char c : ":@&=+$,;/".toCharArray()) {
        absPath.set(c);
    }

    PATH_ALLOWED_CHARS.or(absPath);
    QUERY_ALLOWED_CHARS.or(uric);
    ANCHOR_ALLOWED_CHARS.or(uric);
    HASH_ALLOWED_CHARS.or(uric);
}
189
/**
 * Private constructor; prevents instantiation of this class.
 */
private UrlUtils() {
    // Empty.
}
196
197 /**
198 * <p>Constructs a URL instance based on the specified URL string, taking into account the fact that the
199 * specified URL string may represent an <code>"about:..."</code> URL, a <code>"javascript:..."</code> URL, or
200 * a <code>data:...</code> URL.</p>
201 *
202 * <p>The caller should be sure that URL strings passed to this method will parse correctly as URLs, as
203 * this method never expects to have to handle {@link MalformedURLException}s.</p>
204 *
205 * @param url the URL string to convert into a URL instance
206 * @return the constructed URL instance
207 */
208 public static URL toUrlSafe(final String url) {
209 try {
210 return toUrlUnsafe(url);
211 }
212 catch (final MalformedURLException e) {
213 // Should never happen.
214 throw new RuntimeException(e);
215 }
216 }
217
218 /**
219 * <p>Constructs a URL instance based on the specified URL string, taking into account the fact that the
220 * specified URL string may represent an <code>"about:..."</code> URL, a <code>"javascript:..."</code> URL, or
221 * a <code>data:...</code> URL.</p>
222 *
223 * <p>Unlike {@link #toUrlSafe(String)}, the caller need not be sure that URL strings passed to this
224 * method will parse correctly as URLs.</p>
225 *
226 * @param url the URL string to convert into a URL instance
227 * @return the constructed URL instance
228 * @throws MalformedURLException if the URL string cannot be converted to a URL instance
229 */
230 public static URL toUrlUnsafe(final String url) throws MalformedURLException {
231 WebAssert.notNull("url", url);
232
233 final String protocol = StringUtils.substringBefore(url, ":").toLowerCase(Locale.ROOT);
234
235 if (protocol.isEmpty() || UrlUtils.isNormalUrlProtocol(protocol)) {
236 final URL response = new URL(url);
237 if (response.getProtocol().startsWith("http")
238 && StringUtils.isEmptyOrNull(response.getHost())) {
239 throw new MalformedURLException("Missing host name in url: " + url);
240 }
241 return response;
242 }
243
244 if (JavaScriptURLConnection.JAVASCRIPT_PREFIX.equals(protocol + ":")) {
245 return new URL(null, url, JS_HANDLER);
246 }
247
248 if (ABOUT.equals(protocol)) {
249 if (ABOUT_BLANK.equalsIgnoreCase(url)) {
250 return URL_ABOUT_BLANK;
251 }
252 return new URL(null, url, ABOUT_HANDLER);
253 }
254
255 if ("data".equals(protocol)) {
256 return new URL(null, url, DATA_HANDLER);
257 }
258
259 return new URL(null, url, AnyHandler.INSTANCE);
260 }
261
262 /**
263 * <p>Encodes illegal characters in the specified URL's path, query string and anchor according to the URL
264 * encoding rules observed in real browsers.</p>
265 *
266 * <p>For example, this method changes
267 * <code>"http://first/?a=b c"</code> to <code>"http://first/?a=b%20c"</code>.</p>
268 *
269 * @param url the URL to encode
270 * @param charset the charset
271 * @return the encoded URL
272 */
273 public static URL encodeUrl(final URL url, final Charset charset) {
274 if (!isNormalUrlProtocol(url.getProtocol())) {
275 return url; // javascript:, about:, data: and anything not supported like foo:
276 }
277
278 try {
279 String path = url.getPath();
280 if (path != null) {
281 path = encode(path, PATH_ALLOWED_CHARS, UTF_8);
282 }
283 String query = url.getQuery();
284 if (query != null) {
285 query = encode(query, QUERY_ALLOWED_CHARS, charset);
286 }
287 String anchor = url.getRef();
288 if (anchor != null) {
289 anchor = encode(anchor, ANCHOR_ALLOWED_CHARS, UTF_8);
290 }
291 return createNewUrl(url.getProtocol(), url.getUserInfo(), url.getHost(),
292 url.getPort(), path, anchor, query);
293 }
294 catch (final MalformedURLException e) {
295 // Impossible... I think.
296 throw new RuntimeException(e);
297 }
298 }
299
300 /**
301 * Encodes and escapes the specified URI anchor string.
302 *
303 * @param anchor the anchor string to encode and escape
304 * @return the encoded and escaped anchor string
305 */
306 public static String encodeAnchor(final String anchor) {
307 if (anchor == null) {
308 return null;
309 }
310 return encode(anchor, ANCHOR_ALLOWED_CHARS, UTF_8);
311 }
312
313 /**
314 * Encodes and escapes the specified URI hash string.
315 *
316 * @param hash the anchor string to encode and escape
317 * @return the encoded and escaped hash string
318 */
319 public static String encodeHash(final String hash) {
320 if (hash == null) {
321 return null;
322 }
323 return encode(hash, HASH_ALLOWED_CHARS, UTF_8);
324 }
325
326 /**
327 * Encodes and escapes the specified URI hash string.
328 *
329 * @param query the query string to encode and escape
330 * @return the encoded and escaped hash string
331 */
332 public static String encodeQuery(final String query) {
333 if (query == null) {
334 return null;
335 }
336 return encode(query, QUERY_ALLOWED_CHARS, UTF_8);
337 }
338
339 /**
340 * Unescapes and decodes the specified string.
341 *
342 * @param escaped the string to be unescaped and decoded
343 * @return the unescaped and decoded string
344 */
345 public static String decode(final String escaped) {
346 try {
347 final byte[] bytes = escaped.getBytes(US_ASCII);
348 final byte[] bytes2 = decodeUrl(bytes);
349 return new String(bytes2, UTF_8);
350 }
351 catch (final IllegalArgumentException e) {
352 // Should never happen.
353 throw new RuntimeException(e);
354 }
355 }
356
357 /**
358 * Escapes and encodes the specified string. Based on HttpClient 3.1's <code>URIUtil.encode()</code> method.
359 *
360 * @param unescaped the string to encode
361 * @param allowed allowed characters that shouldn't be escaped
362 * @param charset the charset to use
363 * @return the escaped string
364 */
365 private static String encode(final String unescaped, final BitSet allowed, final Charset charset) {
366 final byte[] bytes = unescaped.getBytes(charset);
367 final byte[] bytes2 = encodeUrl(allowed, bytes);
368 return encodePercentSign(bytes2);
369 }
370
371 /**
372 * Encodes every occurrence of the escape character '%' in the given input
373 * string that is not followed by two hexadecimal characters.
374 * @param input the input bytes
375 * @return the given input string where every occurrence of <code>%</code> in
376 * invalid escape sequences has been replace by <code>%25</code>
377 */
378 private static String encodePercentSign(final byte[] input) {
379 if (input == null) {
380 return null;
381 }
382
383 final StringBuilder result = new StringBuilder(new String(input, US_ASCII));
384 int state = -0;
385 int offset = 0;
386 for (int i = 0; i < input.length; i++) {
387 final byte b = input[i];
388 if (state == 0 && b == '%') {
389 state = 1;
390 }
391 else if (state == 1 || state == 2) {
392 if (('0' <= b && b <= '9')
393 || ('A' <= b && b <= 'F')
394 || ('a' <= b && b <= 'f')) {
395 state++;
396 if (state == 3) {
397 state = 0;
398 }
399 }
400 else {
401 final int st = i - state + offset;
402 result.replace(st, st + 1, "%25");
403 offset = offset + 2;
404 state = b == '%' ? 1 : 0;
405 }
406 }
407 }
408 if (state == 1 || state == 2) {
409 final int st = input.length - state + offset;
410 result.replace(st, st + 1, "%25");
411 }
412 return result.toString();
413 }
414
415 /**
416 * Creates and returns a new URL using only the protocol and authority from the given one.
417 * @param u the URL on which to base the returned URL
418 * @return a new URL using only the protocol and authority from the given one
419 * @throws MalformedURLException if there is a problem creating the new URL
420 */
421 public static URL getUrlWithoutPathRefQuery(final URL u) throws MalformedURLException {
422 return createNewUrl(u.getProtocol(), u.getAuthority(), null, null, null);
423 }
424
425 /**
426 * Creates and returns a new URL using only the protocol, authority and path
427 * from the given one.
428 * @param u the URL on which to base the returned URL
429 * @return a new URL using only the protocol and authority from the given one
430 * @throws MalformedURLException if there is a problem creating the new URL
431 */
432 public static URL getUrlWithoutRef(final URL u) throws MalformedURLException {
433 return createNewUrl(u.getProtocol(), u.getAuthority(), u.getPath(), null, u.getQuery());
434 }
435
436 /**
437 * Creates and returns a new URL identical to the specified URL, except using the specified protocol.
438 * @param u the URL on which to base the returned URL
439 * @param newProtocol the new protocol to use in the returned URL
440 * @return a new URL identical to the specified URL, except using the specified protocol
441 * @throws MalformedURLException if there is a problem creating the new URL
442 */
443 public static URL getUrlWithNewProtocol(final URL u, final String newProtocol) throws MalformedURLException {
444 return createNewUrl(newProtocol, u.getAuthority(), u.getPath(), u.getRef(), u.getQuery());
445 }
446
447 /**
448 * Creates and returns a new URL identical to the specified URL, except using the specified host.
449 * @param u the URL on which to base the returned URL
450 * @param newHost the new host to use in the returned URL
451 * @return a new URL identical to the specified URL, except using the specified host
452 * @throws MalformedURLException if there is a problem creating the new URL
453 */
454 public static URL getUrlWithNewHost(final URL u, final String newHost)
455 throws MalformedURLException {
456 return createNewUrl(u.getProtocol(), u.getUserInfo(), newHost,
457 u.getPort(), u.getPath(), u.getRef(), u.getQuery());
458 }
459
460 /**
461 * Creates and returns a new URL identical to the specified URL, except using the specified host.
462 * @param u the URL on which to base the returned URL
463 * @param newHost the new host to use in the returned URL
464 * @param newPort the new port to use in the returned URL
465 * @return a new URL identical to the specified URL, except using the specified host
466 * @throws MalformedURLException if there is a problem creating the new URL
467 */
468 public static URL getUrlWithNewHostAndPort(final URL u, final String newHost, final int newPort)
469 throws MalformedURLException {
470 return createNewUrl(u.getProtocol(), u.getUserInfo(), newHost, newPort, u.getPath(), u.getRef(), u.getQuery());
471 }
472
473 /**
474 * Creates and returns a new URL identical to the specified URL, except using the specified port.
475 * @param u the URL on which to base the returned URL
476 * @param newPort the new port to use in the returned URL or -1 to remove it
477 * @return a new URL identical to the specified URL, except using the specified port
478 * @throws MalformedURLException if there is a problem creating the new URL
479 */
480 public static URL getUrlWithNewPort(final URL u, final int newPort) throws MalformedURLException {
481 return createNewUrl(u.getProtocol(), u.getUserInfo(), u.getHost(),
482 newPort, u.getPath(), u.getRef(), u.getQuery());
483 }
484
485 /**
486 * Creates and returns a new URL identical to the specified URL, except using the specified path.
487 * @param u the URL on which to base the returned URL
488 * @param newPath the new path to use in the returned URL
489 * @return a new URL identical to the specified URL, except using the specified path
490 * @throws MalformedURLException if there is a problem creating the new URL
491 */
492 public static URL getUrlWithNewPath(final URL u, final String newPath) throws MalformedURLException {
493 return createNewUrl(u.getProtocol(), u.getAuthority(), newPath, u.getRef(), u.getQuery());
494 }
495
496 /**
497 * Creates and returns a new URL identical to the specified URL, except using the specified reference.
498 * @param u the URL on which to base the returned URL
499 * @param newRef the new reference to use in the returned URL or null to remove it
500 * @return a new URL identical to the specified URL, except using the specified reference
501 * @throws MalformedURLException if there is a problem creating the new URL
502 */
503 public static URL getUrlWithNewRef(final URL u, final String newRef) throws MalformedURLException {
504 return createNewUrl(u.getProtocol(), u.getAuthority(), u.getPath(), newRef, u.getQuery());
505 }
506
507 /**
508 * Creates and returns a new URL identical to the specified URL, except using the specified query string.
509 * @param u the URL on which to base the returned URL
510 * @param newQuery the new query string to use in the returned URL
511 * @return a new URL identical to the specified URL, except using the specified query string
512 * @throws MalformedURLException if there is a problem creating the new URL
513 */
514 public static URL getUrlWithNewQuery(final URL u, final String newQuery) throws MalformedURLException {
515 return createNewUrl(u.getProtocol(), u.getAuthority(), u.getPath(), u.getRef(), newQuery);
516 }
517
518 /**
519 * Creates and returns a new URL identical to the specified URL, ignoring path, protocol and query.
520 * @param u the URL on which to base the returned URL
521 * @return a new URL identical to the specified URL, ignoring path, protocol and query
522 * @throws MalformedURLException if there is a problem creating the new URL
523 */
524 public static URL getUrlWithProtocolAndAuthority(final URL u) throws MalformedURLException {
525 return createNewUrl(u.getProtocol(), u.getAuthority(), null, null, null);
526 }
527
528 /**
529 * Creates and returns a new URL identical to the specified URL but with a changed user name.
530 * @param u the URL on which to base the returned URL
531 * @param newUserName the new user name or null to remove it
532 * @return a new URL identical to the specified URL; only user name updated
533 * @throws MalformedURLException if there is a problem creating the new URL
534 */
535 public static URL getUrlWithNewUserName(final URL u, final String newUserName) throws MalformedURLException {
536 String newUserInfo = newUserName == null ? "" : newUserName;
537 final String userInfo = u.getUserInfo();
538 if (StringUtils.isNotBlank(userInfo)) {
539 final int colonIdx = userInfo.indexOf(':');
540 if (colonIdx > -1) {
541 newUserInfo = newUserInfo + userInfo.substring(colonIdx);
542 }
543 }
544 return createNewUrl(u.getProtocol(), newUserInfo.isEmpty() ? null : newUserInfo,
545 u.getHost(), u.getPort(), u.getPath(), u.getRef(), u.getQuery());
546 }
547
548 /**
549 * Creates and returns a new URL identical to the specified URL but with a changed user password.
550 * @param u the URL on which to base the returned URL
551 * @param newUserPassword the new user password or null to remove it
552 * @return a new URL identical to the specified URL; only user name updated
553 * @throws MalformedURLException if there is a problem creating the new URL
554 */
555 public static URL getUrlWithNewUserPassword(final URL u, final String newUserPassword)
556 throws MalformedURLException {
557 String newUserInfo = newUserPassword == null ? "" : ':' + newUserPassword;
558 final String userInfo = u.getUserInfo();
559 if (StringUtils.isNotBlank(userInfo)) {
560 final int colonIdx = userInfo.indexOf(':');
561 if (colonIdx > -1) {
562 newUserInfo = userInfo.substring(0, colonIdx) + newUserInfo;
563 }
564 else {
565 newUserInfo = userInfo + newUserInfo;
566 }
567 }
568 return createNewUrl(u.getProtocol(), newUserInfo.isEmpty() ? null : newUserInfo,
569 u.getHost(), u.getPort(), u.getPath(), u.getRef(), u.getQuery());
570 }
571
572 /**
573 * Creates a new URL based on the specified fragments.
574 * @param protocol the protocol to use (may not be {@code null})
575 * @param userInfo the user info to use (may be {@code null})
576 * @param host the host to use (may not be {@code null})
577 * @param port the port to use (may be <code>-1</code> if no port is specified)
578 * @param path the path to use (may be {@code null} and may omit the initial <code>'/'</code>)
579 * @param ref the reference to use (may be {@code null} and must not include the <code>'#'</code>)
580 * @param query the query to use (may be {@code null} and must not include the <code>'?'</code>)
581 * @return a new URL based on the specified fragments
582 * @throws MalformedURLException if there is a problem creating the new URL
583 */
584 private static URL createNewUrl(final String protocol, final String userInfo, final String host, final int port,
585 final String path, final String ref, final String query) throws MalformedURLException {
586 final StringBuilder s = new StringBuilder();
587 s.append(protocol).append("://");
588 if (userInfo != null) {
589 s.append(userInfo).append('@');
590 }
591 s.append(host);
592 if (port != -1) {
593 s.append(':').append(port);
594 }
595 if (path != null && !path.isEmpty()) {
596 if ('/' != path.charAt(0)) {
597 s.append('/');
598 }
599 s.append(path);
600 }
601 if (query != null) {
602 s.append('?').append(query);
603 }
604 if (ref != null) {
605 if (ref.isEmpty() || ref.charAt(0) != '#') {
606 s.append('#');
607 }
608 s.append(ref);
609 }
610
611 return new URL(s.toString());
612 }
613
614 /**
615 * Creates a new URL based on the specified fragments.
616 * @param protocol the protocol to use (may not be {@code null})
617 * @param authority the authority to use (may not be {@code null})
618 * @param path the path to use (may be {@code null} and may omit the initial <code>'/'</code>)
619 * @param ref the reference to use (may be {@code null} and must not include the <code>'#'</code>)
620 * @param query the query to use (may be {@code null} and must not include the <code>'?'</code>)
621 * @return a new URL based on the specified fragments
622 * @throws MalformedURLException if there is a problem creating the new URL
623 */
624 private static URL createNewUrl(final String protocol, final String authority,
625 final String path, final String ref, final String query) throws MalformedURLException {
626
627 // pre-compute length of StringBuilder
628 int len = protocol.length() + 1;
629 if (authority != null && !authority.isEmpty()) {
630 len += 2 + authority.length();
631 }
632 if (path != null) {
633 len += path.length();
634 }
635 if (query != null) {
636 len += 1 + query.length();
637 }
638 if (ref != null) {
639 len += 1 + ref.length();
640 }
641
642 final StringBuilder s = new StringBuilder(len);
643 s.append(protocol).append(':');
644 if (authority != null && !authority.isEmpty()) {
645 s.append("//").append(authority);
646 }
647 if (path != null) {
648 s.append(path);
649 }
650 if (query != null) {
651 s.append('?').append(query);
652 }
653 if (ref != null) {
654 if (ref.isEmpty() || ref.charAt(0) != '#') {
655 s.append('#');
656 }
657 s.append(ref);
658 }
659
660 return toUrlSafe(s.toString());
661 }
662
663 /**
664 * Resolves a given relative URL against a base URL. See
665 * <a href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a>
666 * Section 4 for more details.
667 *
668 * @param baseUrl The base URL in which to resolve the specification.
669 * @param relativeUrl The relative URL to resolve against the base URL.
670 * @return the resolved specification.
671 */
672 public static String resolveUrl(final String baseUrl, final String relativeUrl) {
673 if (baseUrl == null) {
674 throw new IllegalArgumentException("Base URL must not be null");
675 }
676 if (relativeUrl == null) {
677 throw new IllegalArgumentException("Relative URL must not be null");
678 }
679 final Url url = resolveUrl(parseUrl(baseUrl), relativeUrl);
680
681 return url.toString();
682 }
683
684 /**
685 * Resolves a given relative URL against a base URL. See
686 * <a href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a>
687 * Section 4 for more details.
688 *
689 * @param baseUrl The base URL in which to resolve the specification.
690 * @param relativeUrl The relative URL to resolve against the base URL.
691 * @return the resolved specification.
692 */
693 public static String resolveUrl(final URL baseUrl, final String relativeUrl) {
694 if (baseUrl == null) {
695 throw new IllegalArgumentException("Base URL must not be null");
696 }
697 return resolveUrl(baseUrl.toExternalForm(), relativeUrl);
698 }
699
700 /**
701 * Parses a given specification using the algorithm depicted in
702 * <a href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a>:
703 * <p>
704 * Section 2.4: Parsing a URL
705 * <p>
706 * An accepted method for parsing URLs is useful to clarify the
707 * generic-RL syntax of Section 2.2 and to describe the algorithm for
708 * resolving relative URLs presented in Section 4. This section
709 * describes the parsing rules for breaking down a URL (relative or
710 * absolute) into the component parts described in Section 2.1. The
711 * rules assume that the URL has already been separated from any
712 * surrounding text and copied to a "parse string". The rules are
713 * listed in the order in which they would be applied by the parser.
714 *
715 * @param spec The specification to parse.
716 * @return the parsed specification.
717 */
718 private static Url parseUrl(String spec) {
719 final Url url = new Url();
720 int startIndex = 0;
721 int endIndex = spec.length();
722
723 // see https://url.spec.whatwg.org/#concept-basic-url-parser
724 // * If input contains any leading or trailing C0 control or space, validation error.
725 // Remove any leading and trailing C0 control or space from input.
726 // * If input contains any ASCII tab or newline, validation error.
727 // Remove all ASCII tab or newline from input.
728
729 if (endIndex > startIndex) {
730 StringBuilder sb = null;
731 boolean before = true;
732 int trailing = 0;
733
734 for (int i = 0; i < endIndex; i++) {
735 final char c = spec.charAt(i);
736 boolean remove = false;
737
738 if (c == '\t' | c == '\r' | c == '\n') {
739 remove = true;
740 }
741 else if ('\u0000' <= c && c <= '\u0020') {
742 if (before) {
743 remove = true;
744 }
745 else {
746 trailing++;
747 }
748 }
749 else {
750 before = false;
751 trailing = 0;
752 }
753
754 if (remove) {
755 if (sb == null) {
756 sb = new StringBuilder(spec.substring(0, i));
757 }
758 }
759 else if (sb != null) {
760 sb.append(c);
761 }
762 }
763
764 if (sb == null) {
765 if (trailing > 0) {
766 endIndex = spec.length() - trailing;
767 spec = spec.substring(0, endIndex);
768 }
769 }
770 else {
771 if (trailing > 0) {
772 spec = sb.substring(0, sb.length() - trailing);
773 }
774 else {
775 spec = sb.toString();
776 }
777 endIndex = spec.length();
778 }
779 }
780
781 // Section 2.4.1: Parsing the Fragment Identifier
782 //
783 // If the parse string contains a crosshatch "#" character, then the
784 // substring after the first (left-most) crosshatch "#" and up to the
785 // end of the parse string is the <fragment> identifier. If the
786 // crosshatch is the last character, or no crosshatch is present, then
787 // the fragment identifier is empty. The matched substring, including
788 // the crosshatch character, is removed from the parse string before
789 // continuing.
790 //
791 // Note that the fragment identifier is not considered part of the URL.
792 // However, since it is often attached to the URL, parsers must be able
793 // to recognize and set aside fragment identifiers as part of the
794 // process.
795 final int crosshatchIndex = StringUtils.indexOf(spec, '#', startIndex, endIndex);
796
797 if (crosshatchIndex >= 0) {
798 url.fragment_ = spec.substring(crosshatchIndex + 1, endIndex);
799 endIndex = crosshatchIndex;
800 }
801 // Section 2.4.2: Parsing the Scheme
802 //
803 // If the parse string contains a colon ":" after the first character
804 // and before any characters not allowed as part of a scheme name (i.e.,
805 // any not an alphanumeric, plus "+", period ".", or hyphen "-"), the
806 // <scheme> of the URL is the substring of characters up to but not
807 // including the first colon. These characters and the colon are then
808 // removed from the parse string before continuing.
809 final int colonIndex = StringUtils.indexOf(spec, ':', startIndex, endIndex);
810
811 if (colonIndex > 0) {
812 final String scheme = spec.substring(startIndex, colonIndex);
813 if (isValidScheme(scheme)) {
814 url.scheme_ = scheme;
815 startIndex = colonIndex + 1;
816 }
817 }
818 // Section 2.4.3: Parsing the Network Location/Login
819 //
820 // If the parse string begins with a double-slash "//", then the
821 // substring of characters after the double-slash and up to, but not
822 // including, the next slash "/" character is the network location/login
823 // (<net_loc>) of the URL. If no trailing slash "/" is present, the
824 // entire remaining parse string is assigned to <net_loc>. The double-
825 // slash and <net_loc> are removed from the parse string before
826 // continuing.
827 //
828 // Note: We also accept a question mark "?" or a semicolon ";" character as
829 // delimiters for the network location/login (<net_loc>) of the URL.
830 final int locationStartIndex;
831 int locationEndIndex;
832
833 if (spec.startsWith("//", startIndex)) {
834 locationStartIndex = startIndex + 2;
835 locationEndIndex = StringUtils.indexOf(spec, '/', locationStartIndex, endIndex);
836 if (locationEndIndex >= 0) {
837 startIndex = locationEndIndex;
838 }
839 }
840 else {
841 locationStartIndex = -1;
842 locationEndIndex = -1;
843 }
844 // Section 2.4.4: Parsing the Query Information
845 //
846 // If the parse string contains a question mark "?" character, then the
847 // substring after the first (left-most) question mark "?" and up to the
848 // end of the parse string is the <query> information. If the question
849 // mark is the last character, or no question mark is present, then the
850 // query information is empty. The matched substring, including the
851 // question mark character, is removed from the parse string before
852 // continuing.
853 final int questionMarkIndex = StringUtils.indexOf(spec, '?', startIndex, endIndex);
854
855 if (questionMarkIndex >= 0) {
856 if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
857 // The substring of characters after the double-slash and up to, but not
858 // including, the question mark "?" character is the network location/login
859 // (<net_loc>) of the URL.
860 locationEndIndex = questionMarkIndex;
861 startIndex = questionMarkIndex;
862 }
863 url.query_ = spec.substring(questionMarkIndex + 1, endIndex);
864 endIndex = questionMarkIndex;
865 }
866 // Section 2.4.5: Parsing the Parameters
867 //
868 // If the parse string contains a semicolon ";" character, then the
869 // substring after the first (left-most) semicolon ";" and up to the end
870 // of the parse string is the parameters (<params>). If the semicolon
871 // is the last character, or no semicolon is present, then <params> is
872 // empty. The matched substring, including the semicolon character, is
873 // removed from the parse string before continuing.
874 final int semicolonIndex = StringUtils.indexOf(spec, ';', startIndex, endIndex);
875
876 if (semicolonIndex >= 0) {
877 if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
878 // The substring of characters after the double-slash and up to, but not
879 // including, the semicolon ";" character is the network location/login
880 // (<net_loc>) of the URL.
881 locationEndIndex = semicolonIndex;
882 startIndex = semicolonIndex;
883 }
884 url.parameters_ = spec.substring(semicolonIndex + 1, endIndex);
885 endIndex = semicolonIndex;
886 }
887 // Section 2.4.6: Parsing the Path
888 //
889 // After the above steps, all that is left of the parse string is the
890 // URL <path> and the slash "/" that may precede it. Even though the
891 // initial slash is not part of the URL path, the parser must remember
892 // whether or not it was present so that later processes can
893 // differentiate between relative and absolute paths. Often this is
894 // done by simply storing the preceding slash along with the path.
895 if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
896 // The entire remaining parse string is assigned to the network
897 // location/login (<net_loc>) of the URL.
898 locationEndIndex = endIndex;
899 }
900 else if (startIndex < endIndex) {
901 url.path_ = spec.substring(startIndex, endIndex);
902 }
903 // Set the network location/login (<net_loc>) of the URL.
904 if ((locationStartIndex >= 0) && (locationEndIndex >= 0)) {
905 url.location_ = spec.substring(locationStartIndex, locationEndIndex);
906 }
907 return url;
908 }
909
910 /**
911 * Returns true if specified string is a valid scheme name.
912 * <p>
913 * https://tools.ietf.org/html/rfc1738
914 * <p>
915 * Scheme names consist of a sequence of characters. The lower case
916 * letters "a"--"z", digits, and the characters plus ("+"), period
917 * ("."), and hyphen ("-") are allowed. For resiliency, programs
918 * interpreting URLs should treat upper case letters as equivalent to
919 * lower case in scheme names (e.g., allow "HTTP" as well as "http").
920 *
921 * @param scheme the scheme string to check
922 * @return true if valid
923 */
924 public static boolean isValidScheme(final String scheme) {
925 final int length = scheme.length();
926 if (length < 1) {
927 return false;
928 }
929
930 char c = scheme.charAt(0);
931 boolean isValid = ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
932 if (!isValid) {
933 return false;
934 }
935
936 for (int i = 1; i < length; i++) {
937 c = scheme.charAt(i);
938 isValid =
939 ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')
940 || ('0' <= c && c <= '9')
941 || c == '+'
942 || c == '.'
943 || c == '-';
944 if (!isValid) {
945 return false;
946 }
947 }
948
949 return true;
950 }
951
952 /**
953 * Returns true if specified string is a special scheme.
954 * see <a href='https://url.spec.whatwg.org/#special-scheme'>
955 * https://url.spec.whatwg.org/#special-scheme</a>
956 *
957 * @param scheme the scheme string to check
958 * @return true if special
959 */
960 public static boolean isSpecialScheme(final String scheme) {
961 final int length = scheme.length();
962 if (length < 2 || length > 5) {
963 return false;
964 }
965
966 final String schemeLC = scheme.toLowerCase(Locale.ROOT);
967 return "ftp".equals(schemeLC)
968 || "file".equals(schemeLC)
969 || "http".equals(schemeLC)
970 || "https".equals(schemeLC)
971 || "ws".equals(schemeLC)
972 || "wss".equals(schemeLC);
973 }
974
/**
 * Resolves a given relative URL against a base URL using the algorithm
 * depicted in <a href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a>:
 * <p>
 * Section 4: Resolving Relative URLs
 * <p>
 * This section describes an example algorithm for resolving URLs within
 * a context in which the URLs may be relative, such that the result is
 * always a URL in absolute form. Although this algorithm cannot
 * guarantee that the resulting URL will equal that intended by the
 * original author, it does guarantee that any valid URL (relative or
 * absolute) can be consistently transformed to an absolute form given a
 * valid base URL.
 * <p>
 * In addition to the RFC steps, leading "../" segments directly after the
 * initial slash of the resulting path are dropped
 * (see {@code removeLeadingSlashPoints}).
 *
 * @param baseUrl The base URL in which to resolve the specification;
 *        if {@code null} the parsed relative URL is returned unchanged.
 * @param relativeUrl The relative URL to resolve against the base URL.
 * @return the resolved specification.
 */
private static Url resolveUrl(final Url baseUrl, final String relativeUrl) {
    // the relative URL is parsed first, even if it turns out to be empty
    final Url url = parseUrl(relativeUrl);
    // Step 1: The base URL is established according to the rules of
    //         Section 3. If the base URL is the empty string (unknown),
    //         the embedded URL is interpreted as an absolute URL and
    //         we are done.
    //         (here "unknown" means that baseUrl is null)
    if (baseUrl == null) {
        return url;
    }
    // Step 2: Both the base and embedded URLs are parsed into their
    //         component parts as described in Section 2.4.
    //      a) If the embedded URL is entirely empty, it inherits the
    //         entire base URL (i.e., is set equal to the base URL)
    //         and we are done.
    //         (a copy is returned, the base URL object itself is not handed out)
    if (relativeUrl.isEmpty()) {
        return new Url(baseUrl);
    }
    //      b) If the embedded URL starts with a scheme name, it is
    //         interpreted as an absolute URL and we are done.
    if (url.scheme_ != null) {
        return url;
    }
    //      c) Otherwise, the embedded URL inherits the scheme of
    //         the base URL.
    url.scheme_ = baseUrl.scheme_;
    // Step 3: If the embedded URL's <net_loc> is non-empty, we skip to
    //         Step 7. Otherwise, the embedded URL inherits the <net_loc>
    //         (if any) of the base URL.
    //         (the check is for null, so a present but empty <net_loc> also skips)
    if (url.location_ != null) {
        return url;
    }
    url.location_ = baseUrl.location_;
    // Step 4: If the embedded URL path is preceded by a slash "/", the
    //         path is not relative and we skip to Step 7.
    //         (only leading "../" segments are stripped here; the "." and ".."
    //         handling of step 6 is not applied to such paths)
    if (url.path_ != null && !url.path_.isEmpty() && url.path_.charAt(0) == '/') {
        url.path_ = removeLeadingSlashPoints(url.path_);
        return url;
    }
    // Step 5: If the embedded URL path is empty (and not preceded by a
    //         slash), then the embedded URL inherits the base URL path,
    //         and
    if (url.path_ == null) {
        url.path_ = baseUrl.path_;
        //  a) if the embedded URL's <params> is non-empty, we skip to
        //     step 7; otherwise, it inherits the <params> of the base
        //     URL (if any) and
        if (url.parameters_ != null) {
            return url;
        }
        url.parameters_ = baseUrl.parameters_;
        //  b) if the embedded URL's <query> is non-empty, we skip to
        //     step 7; otherwise, it inherits the <query> of the base
        //     URL (if any) and we skip to step 7.
        if (url.query_ != null) {
            return url;
        }
        url.query_ = baseUrl.query_;
        return url;
    }
    // Step 6: The last segment of the base URL's path (anything
    //         following the rightmost slash "/", or the entire path if no
    //         slash is present) is removed and the embedded URL's path is
    //         appended in its place. The following operations are
    //         then applied, in order, to the new path:
    final String basePath = baseUrl.path_;
    String path = "";

    if (basePath == null) {
        // a base URL without path is handled like a base path of "/"
        path = "/";
    }
    else {
        final int lastSlashIndex = basePath.lastIndexOf('/');

        // keep everything up to and including the rightmost slash;
        // without any slash the base path is dropped completely
        if (lastSlashIndex >= 0) {
            path = basePath.substring(0, lastSlashIndex + 1);
        }
    }

    path = path.concat(url.path_);
    //      a) All occurrences of "./", where "." is a complete path
    //         segment, are removed.
    //         (only "/./" is searched for, a "./" at the very start of
    //         the path is not touched by this loop)
    int pathSegmentIndex;

    while ((pathSegmentIndex = path.indexOf("/./")) >= 0) {
        path = path.substring(0, pathSegmentIndex + 1).concat(path.substring(pathSegmentIndex + 3));
    }
    //      b) If the path ends with "." as a complete path segment,
    //         that "." is removed.
    if (path.endsWith("/.")) {
        path = path.substring(0, path.length() - 1);
    }
    //      c) All occurrences of "<segment>/../", where <segment> is a
    //         complete path segment not equal to "..", are removed.
    //         Removal of these path segments is performed iteratively,
    //         removing the leftmost matching pattern on each iteration,
    //         until no matching pattern remains.
    //         (the loop ends as soon as the leftmost "/../" is located at
    //         index 0; such leading segments are handled further below)
    while ((pathSegmentIndex = path.indexOf("/../")) > 0) {
        final String pathSegment = path.substring(0, pathSegmentIndex);
        final int slashIndex = pathSegment.lastIndexOf('/');

        if (slashIndex >= 0) {
            // NOTE(review): the compared substring starts with the '/' found at
            // slashIndex, therefore it never equals ".." and this condition
            // is always true as written
            if (!"..".equals(pathSegment.substring(slashIndex))) {
                path = path.substring(0, slashIndex + 1).concat(path.substring(pathSegmentIndex + 4));
            }
        }
        else {
            // no slash in front of the segment: the segment and the
            // following "/../" are removed from the start of the path
            path = path.substring(pathSegmentIndex + 4);
        }
    }
    //      d) If the path ends with "<segment>/..", where <segment> is a
    //         complete path segment not equal to "..", that
    //         "<segment>/.." is removed.
    //         (the segment is not compared against ".." here)
    if (path.endsWith("/..")) {
        final String pathSegment = path.substring(0, path.length() - 3);
        final int slashIndex = pathSegment.lastIndexOf('/');

        if (slashIndex >= 0) {
            path = path.substring(0, slashIndex + 1);
        }
    }

    // not part of the RFC: drop "../" segments left at the start of the path
    path = removeLeadingSlashPoints(path);

    url.path_ = path;
    // Step 7: The resulting URL components, including any inherited from
    //         the base URL, are recombined to give the absolute form of
    //         the embedded URL.
    return url;
}
1122
1123 /**
1124 * "../" after the leading "/" should be removed as browsers do (not in RFC)
1125 */
1126 private static String removeLeadingSlashPoints(final String path) {
1127 int i = 1;
1128 while (path.startsWith("../", i)) {
1129 i = i + 3;
1130 }
1131
1132 if (i > 1) {
1133 return "/" + path.substring(i);
1134 }
1135
1136 return path;
1137 }
1138
1139 /**
1140 * Class <code>Url</code> represents a Uniform Resource Locator.
1141 */
1142 private static class Url {
1143
1144 private String scheme_;
1145 private String location_;
1146 private String path_;
1147 private String parameters_;
1148 private String query_;
1149 private String fragment_;
1150
1151 /**
1152 * Creates a <code>Url</code> object.
1153 */
1154 Url() {
1155 super();
1156 }
1157
1158 /**
1159 * Creates a <code>Url</code> object from the specified
1160 * <code>Url</code> object.
1161 *
1162 * @param url a <code>Url</code> object.
1163 */
1164 Url(final Url url) {
1165 scheme_ = url.scheme_;
1166 location_ = url.location_;
1167 path_ = url.path_;
1168 parameters_ = url.parameters_;
1169 query_ = url.query_;
1170 fragment_ = url.fragment_;
1171 }
1172
1173 /**
1174 * Returns a string representation of the <code>Url</code> object.
1175 *
1176 * @return a string representation of the <code>Url</code> object.
1177 */
1178 @Override
1179 public String toString() {
1180 final StringBuilder sb = new StringBuilder();
1181
1182 if (scheme_ != null) {
1183 sb.append(scheme_).append(':');
1184 }
1185 if (location_ != null) {
1186 sb.append("//").append(location_);
1187 }
1188 if (path_ != null) {
1189 sb.append(path_);
1190 }
1191 if (parameters_ != null) {
1192 sb.append(';').append(parameters_);
1193 }
1194 if (query_ != null) {
1195 sb.append('?').append(query_);
1196 }
1197 if (fragment_ != null) {
1198 sb.append('#').append(fragment_);
1199 }
1200 return sb.toString();
1201 }
1202 }
1203
1204 static boolean isNormalUrlProtocol(final String protocol) {
1205 return "http".equals(protocol) || "https".equals(protocol) || "file".equals(protocol);
1206 }
1207
1208 /**
1209 * More or less the same as sameFile(URL, URL) but without
1210 * resolving the host to an IP address for comparing.
1211 * Additionally we do some path normalization.
1212 *
1213 * @param u1 a URL object
1214 * @param u2 a URL object
1215 * @return true if u1 and u2 refer to the same file
1216 */
1217 public static boolean sameFile(final URL u1, final URL u2) {
1218 if (u1 == u2) {
1219 return true;
1220 }
1221 if (u1 == null || u2 == null) {
1222 return false;
1223 }
1224
1225 // Compare the protocols.
1226 final String p1 = u1.getProtocol();
1227 final String p2 = u2.getProtocol();
1228 if (!(p1 == p2 || (p1 != null && p1.equalsIgnoreCase(p2)))) {
1229 return false;
1230 }
1231
1232 // Compare the ports.
1233 final int port1 = (u1.getPort() == -1) ? u1.getDefaultPort() : u1.getPort();
1234 final int port2 = (u2.getPort() == -1) ? u2.getDefaultPort() : u2.getPort();
1235 if (port1 != port2) {
1236 return false;
1237 }
1238
1239 // Compare the hosts.
1240 final String h1 = u1.getHost();
1241 final String h2 = u2.getHost();
1242 if (!(h1 == h2 || (h1 != null && h1.equalsIgnoreCase(h2)))) {
1243 return false;
1244 }
1245
1246 // Compare the files.
1247 String f1 = u1.getFile();
1248 if (f1.isEmpty()) {
1249 f1 = "/";
1250 }
1251 String f2 = u2.getFile();
1252 if (f2.isEmpty()) {
1253 f2 = "/";
1254 }
1255 if (f1.indexOf('.') > 0 || f2.indexOf('.') > 0) {
1256 try {
1257 f1 = u1.toURI().normalize().toURL().getFile();
1258 f2 = u2.toURI().normalize().toURL().getFile();
1259 }
1260 catch (final RuntimeException e) {
1261 throw e;
1262 }
1263 catch (final Exception ignored) {
1264 // ignore
1265 }
1266 }
1267
1268 return Objects.equals(f1, f2);
1269 }
1270
1271 /**
1272 * Helper that constructs a normalized url string
1273 * usable as cache key.
1274 *
1275 * @param url a URL object
1276 * @return the normalized string
1277 */
1278 public static String normalize(final URL url) {
1279 final StringBuilder result = new StringBuilder();
1280 result.append(url.getProtocol())
1281 .append("://")
1282 .append(url.getHost())
1283 .append(':')
1284 .append((url.getPort() == -1) ? url.getDefaultPort() : url.getPort());
1285
1286 // Compare the files.
1287 String f = url.getFile();
1288 if (f.isEmpty()) {
1289 result.append('/');
1290 }
1291 else {
1292 if (f.indexOf('.') > 0) {
1293 try {
1294 f = url.toURI().normalize().toURL().getFile();
1295 }
1296 catch (final Exception ignored) {
1297 // ignore
1298 }
1299 }
1300 result.append(f);
1301 }
1302
1303 return result.toString();
1304 }
1305
1306 /**
1307 * Constructs a {@link URI} using the specified URL.
1308 *
1309 * @param url the URL
1310 * @param query the query
1311 *
1312 * @throws URISyntaxException
1313 * If both a scheme and a path are given but the path is
1314 * relative, if the URI string constructed from the given
1315 * components violates RFC 2396, or if the authority
1316 * component of the string is present but cannot be parsed
1317 * as a server-based authority
1318 * @return the URI
1319 */
1320 public static URI toURI(final URL url, final String query) throws URISyntaxException {
1321 final String scheme = url.getProtocol();
1322 final String host = url.getHost();
1323 final int port = url.getPort();
1324 final String path = url.getPath();
1325 final StringBuilder buffer = new StringBuilder();
1326 if (host != null) {
1327 if (scheme != null) {
1328 buffer.append(scheme).append("://");
1329 }
1330 buffer.append(host);
1331 if (port > 0) {
1332 buffer.append(':').append(port);
1333 }
1334 }
1335 if (path == null || path.isEmpty() || path.charAt(0) != '/') {
1336 buffer.append('/');
1337 }
1338 if (path != null) {
1339 buffer.append(path);
1340 }
1341 if (query != null) {
1342 buffer.append('?').append(query);
1343 }
1344 return new URI(buffer.toString());
1345 }
1346
1347 /**
1348 * @param part the part to encode
1349 * @return the ecoded string
1350 */
1351 public static String encodeQueryPart(final String part) {
1352 if (part == null || part.isEmpty()) {
1353 return "";
1354 }
1355
1356 return URLEncoder.encode(part, UTF_8);
1357 }
1358
1359 /**
1360 * Removes the well known ports if it can be deduced from protocol.
1361 * @param url the url to clean up
1362 * @return a new URL without the port or the given one
1363 * @throws MalformedURLException if the URL string cannot be converted to a URL instance
1364 */
1365 public static URL removeRedundantPort(final URL url) throws MalformedURLException {
1366 if (("https".equals(url.getProtocol()) && url.getPort() == 443)
1367 || ("http".equals(url.getProtocol()) && url.getPort() == 80)) {
1368 return getUrlWithNewPort(url, -1);
1369 }
1370 return url;
1371 }
1372
1373 /**
1374 * Decodes an array of URL safe 7-bit characters into an array of original bytes.
1375 * Escaped characters are converted back to their original representation.
1376 * @param bytes array of URL safe characters
1377 * @param removeWhitespace if true don't add whitespace chars to the output
1378 * @return array of original bytes
1379 * @throws IllegalArgumentException in case of error
1380 */
1381 public static byte[] decodeDataUrl(final byte[] bytes, final boolean removeWhitespace)
1382 throws IllegalArgumentException {
1383 // adapted from apache commons codec
1384 if (bytes == null) {
1385 return null;
1386 }
1387 final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
1388 for (int i = 0; i < bytes.length; i++) {
1389 int b = bytes[i];
1390 if (b == '%') {
1391 try {
1392 final int u = digit16(bytes[++i]);
1393 final int l = digit16(bytes[++i]);
1394 b = (u << 4) + l;
1395 }
1396 catch (final ArrayIndexOutOfBoundsException e) {
1397 throw new IllegalArgumentException("Invalid URL encoding: ", e);
1398 }
1399 }
1400 if (removeWhitespace
1401 && (b == 9 || b == 10 || b == 12 || b == 13 || b == 32)) {
1402 continue;
1403 }
1404
1405 buffer.write(b);
1406 }
1407 return buffer.toByteArray();
1408 }
1409
1410 /**
1411 * Decodes an array of URL safe 7-bit characters into an array of original bytes.
1412 * Escaped characters are converted back to their original representation.
1413 * @param bytes array of URL safe characters
1414 * @return array of original bytes
1415 * @throws IllegalArgumentException in case of error
1416 */
1417 public static byte[] decodeUrl(final byte[] bytes) throws IllegalArgumentException {
1418 // adapted from apache commons codec
1419 if (bytes == null) {
1420 return null;
1421 }
1422 final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
1423 for (int i = 0; i < bytes.length; i++) {
1424 final int b = bytes[i];
1425 if (b == '+') {
1426 buffer.write(' ');
1427 }
1428 else if (b == '%') {
1429 try {
1430 final int u = digit16(bytes[++i]);
1431 final int l = digit16(bytes[++i]);
1432 buffer.write((char) ((u << 4) + l));
1433 }
1434 catch (final ArrayIndexOutOfBoundsException e) {
1435 throw new IllegalArgumentException("Invalid URL encoding: ", e);
1436 }
1437 }
1438 else {
1439 buffer.write(b);
1440 }
1441 }
1442 return buffer.toByteArray();
1443 }
1444
1445 private static int digit16(final byte b) throws IllegalArgumentException {
1446 final int i = Character.digit((char) b, 16);
1447 if (i == -1) {
1448 throw new IllegalArgumentException("Invalid URL encoding: not a valid digit (radix 16): " + b);
1449 }
1450 return i;
1451 }
1452
1453 /**
1454 * Encodes an array of bytes into an array of URL safe 7-bit characters. Unsafe characters are escaped.
1455 * @param urlsafe bitset of characters deemed URL safe
1456 * @param bytes array of bytes to convert to URL safe characters
1457 * @return array of bytes containing URL safe characters
1458 */
1459 public static byte[] encodeUrl(final BitSet urlsafe, final byte[] bytes) {
1460 // adapted from apache commons codec
1461 if (bytes == null) {
1462 return null;
1463 }
1464
1465 final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
1466 for (final byte c : bytes) {
1467 int b = c;
1468 if (b < 0) {
1469 b = 256 + b;
1470 }
1471 if (urlsafe.get(b)) {
1472 if (b == ' ') {
1473 b = '+';
1474 }
1475 buffer.write(b);
1476 }
1477 else {
1478 buffer.write('%');
1479 final char hex1 = hexDigit(b >> 4);
1480 final char hex2 = hexDigit(b);
1481 buffer.write(hex1);
1482 buffer.write(hex2);
1483 }
1484 }
1485 return buffer.toByteArray();
1486 }
1487
1488 private static char hexDigit(final int b) {
1489 return Character.toUpperCase(Character.forDigit(b & 0xF, 16));
1490 }
1491
1492 /**
1493 * Determines whether two URLs share the same origin according to the Same-Origin Policy.
1494 * Two URLs are considered to have the same origin if they have the same protocol (scheme),
1495 * host, and port.
1496 *
1497 * <p>The method handles default ports correctly by using the URL's default port when
1498 * the explicit port is -1 (indicating no port was specified).
1499 *
1500 * @param originUrl the first URL to compare (must not be null)
1501 * @param newUrl the second URL to compare (must not be null)
1502 * @return {@code true} if both URLs have the same host and effective port; {@code false} otherwise
1503 */
1504 public static boolean isSameOrigin(final URL originUrl, final URL newUrl) {
1505 if (!originUrl.getProtocol().equals(newUrl.getProtocol())) {
1506 return false;
1507 }
1508
1509 if (!originUrl.getHost().equalsIgnoreCase(newUrl.getHost())) {
1510 return false;
1511 }
1512
1513 int originPort = originUrl.getPort();
1514 if (originPort == -1) {
1515 originPort = originUrl.getDefaultPort();
1516 }
1517 int newPort = newUrl.getPort();
1518 if (newPort == -1) {
1519 newPort = newUrl.getDefaultPort();
1520 }
1521 return originPort == newPort;
1522 }
1523 }