1 /*
2 * Copyright (c) 2002-2025 Gargoyle Software Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 * https://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15 package org.htmlunit.util;
16
17 import static java.nio.charset.StandardCharsets.US_ASCII;
18 import static java.nio.charset.StandardCharsets.UTF_8;
19
20 import java.io.ByteArrayOutputStream;
21 import java.io.UnsupportedEncodingException;
22 import java.net.MalformedURLException;
23 import java.net.URI;
24 import java.net.URISyntaxException;
25 import java.net.URL;
26 import java.net.URLEncoder;
27 import java.net.URLStreamHandler;
28 import java.nio.charset.Charset;
29 import java.util.BitSet;
30 import java.util.Locale;
31 import java.util.Objects;
32
33 import org.htmlunit.WebAssert;
34 import org.htmlunit.protocol.AnyHandler;
35 import org.htmlunit.protocol.javascript.JavaScriptURLConnection;
36
37 /**
38 * URL utilities class that makes it easy to create new URLs based off of old URLs
39 * without having to assemble or parse them yourself.
40 *
41 * @author Daniel Gredler
42 * @author Martin Tamme
43 * @author Sudhan Moghe
44 * @author Marc Guillemot
45 * @author Ahmed Ashour
46 * @author Ronald Brill
47 * @author Joerg Werner
48 * @author Hartmut Arlt
49 */
50 public final class UrlUtils {
51
/** The scheme name "about" (without the colon). */
public static final String ABOUT = "about";
/** The scheme prefix "about:". */
public static final String ABOUT_SCHEME = ABOUT + ":";
/** The string form of the blank page URL, "about:blank". */
public static final String ABOUT_BLANK = ABOUT_SCHEME + "blank";
/** Shared URL instance for "about:blank". */
public static final URL URL_ABOUT_BLANK;

/** Stream handler used when building "javascript:" URLs. */
private static final URLStreamHandler JS_HANDLER;
/** Stream handler used when building "about:" URLs. */
private static final URLStreamHandler ABOUT_HANDLER;
/** Stream handler used when building "data:" URLs. */
private static final URLStreamHandler DATA_HANDLER;

/** Characters that stay unescaped inside a path. */
private static final BitSet PATH_ALLOWED_CHARS = new BitSet(256);
/** Characters that stay unescaped inside a query string. */
private static final BitSet QUERY_ALLOWED_CHARS = new BitSet(256);
/** Characters that stay unescaped inside an anchor. */
private static final BitSet ANCHOR_ALLOWED_CHARS = new BitSet(256);
/** Characters that stay unescaped inside a hash. */
private static final BitSet HASH_ALLOWED_CHARS = new BitSet(256);

/*
 * Creates the protocol handlers and the shared "about:blank" URL, then fills the
 * allowed-character sets. The character classes are based on HttpClient 3.1's URI bit sets.
 */
static {
    // the handlers have to exist before anything can call toUrlSafe()
    JS_HANDLER = new org.htmlunit.protocol.javascript.Handler();
    ABOUT_HANDLER = new org.htmlunit.protocol.about.Handler();
    DATA_HANDLER = new org.htmlunit.protocol.data.Handler();

    try {
        URL_ABOUT_BLANK = new URL(null, ABOUT_BLANK, ABOUT_HANDLER);
    }
    catch (final MalformedURLException e) {
        // should never happen
        throw new RuntimeException(e);
    }

    // unreserved = alphanumeric | mark
    final BitSet unreserved = new BitSet(256);
    unreserved.set('a', 'z' + 1);
    unreserved.set('A', 'Z' + 1);
    unreserved.set('0', '9' + 1);
    for (final char mark : "-_.!~*'()".toCharArray()) {
        unreserved.set(mark);
    }

    // uric = reserved | unreserved | escaped
    // ("escaped" is '%' plus the hex digits; the hex digits are already part of unreserved)
    final BitSet uric = (BitSet) unreserved.clone();
    for (final char c : ";/?:@&=+$,%".toCharArray()) {
        uric.set(c);
    }

    // abs_path = '/' | segment, segment = pchar | ';', pchar = unreserved | escaped | ":@&=+$,"
    final BitSet absPath = (BitSet) unreserved.clone();
    for (final char c : "/;:@&=+$,%".toCharArray()) {
        absPath.set(c);
    }

    PATH_ALLOWED_CHARS.or(absPath);
    QUERY_ALLOWED_CHARS.or(uric);
    ANCHOR_ALLOWED_CHARS.or(uric);
    HASH_ALLOWED_CHARS.or(uric);
}
190
/**
 * Disallow instantiation of this class; the constructor is private and does nothing.
 */
private UrlUtils() {
    // Empty.
}
197
198 /**
199 * <p>Constructs a URL instance based on the specified URL string, taking into account the fact that the
200 * specified URL string may represent an <code>"about:..."</code> URL, a <code>"javascript:..."</code> URL, or
201 * a <code>data:...</code> URL.</p>
202 *
203 * <p>The caller should be sure that URL strings passed to this method will parse correctly as URLs, as
204 * this method never expects to have to handle {@link MalformedURLException}s.</p>
205 *
206 * @param url the URL string to convert into a URL instance
207 * @return the constructed URL instance
208 */
209 public static URL toUrlSafe(final String url) {
210 try {
211 return toUrlUnsafe(url);
212 }
213 catch (final MalformedURLException e) {
214 // Should never happen.
215 throw new RuntimeException(e);
216 }
217 }
218
219 /**
220 * <p>Constructs a URL instance based on the specified URL string, taking into account the fact that the
221 * specified URL string may represent an <code>"about:..."</code> URL, a <code>"javascript:..."</code> URL, or
222 * a <code>data:...</code> URL.</p>
223 *
224 * <p>Unlike {@link #toUrlSafe(String)}, the caller need not be sure that URL strings passed to this
225 * method will parse correctly as URLs.</p>
226 *
227 * @param url the URL string to convert into a URL instance
228 * @return the constructed URL instance
229 * @throws MalformedURLException if the URL string cannot be converted to a URL instance
230 */
231 public static URL toUrlUnsafe(final String url) throws MalformedURLException {
232 WebAssert.notNull("url", url);
233
234 final String protocol = StringUtils.substringBefore(url, ":").toLowerCase(Locale.ROOT);
235
236 if (protocol.isEmpty() || UrlUtils.isNormalUrlProtocol(protocol)) {
237 final URL response = new URL(url);
238 if (response.getProtocol().startsWith("http")
239 && StringUtils.isEmptyOrNull(response.getHost())) {
240 throw new MalformedURLException("Missing host name in url: " + url);
241 }
242 return response;
243 }
244
245 if (JavaScriptURLConnection.JAVASCRIPT_PREFIX.equals(protocol + ":")) {
246 return new URL(null, url, JS_HANDLER);
247 }
248
249 if (ABOUT.equals(protocol)) {
250 if (ABOUT_BLANK.equalsIgnoreCase(url)) {
251 return URL_ABOUT_BLANK;
252 }
253 return new URL(null, url, ABOUT_HANDLER);
254 }
255
256 if ("data".equals(protocol)) {
257 return new URL(null, url, DATA_HANDLER);
258 }
259
260 return new URL(null, url, AnyHandler.INSTANCE);
261 }
262
263 /**
264 * <p>Encodes illegal characters in the specified URL's path, query string and anchor according to the URL
265 * encoding rules observed in real browsers.</p>
266 *
267 * <p>For example, this method changes
268 * <code>"http://first/?a=b c"</code> to <code>"http://first/?a=b%20c"</code>.</p>
269 *
270 * @param url the URL to encode
271 * @param charset the charset
272 * @return the encoded URL
273 */
274 public static URL encodeUrl(final URL url, final Charset charset) {
275 if (!isNormalUrlProtocol(url.getProtocol())) {
276 return url; // javascript:, about:, data: and anything not supported like foo:
277 }
278
279 try {
280 String path = url.getPath();
281 if (path != null) {
282 path = encode(path, PATH_ALLOWED_CHARS, UTF_8);
283 }
284 String query = url.getQuery();
285 if (query != null) {
286 query = encode(query, QUERY_ALLOWED_CHARS, charset);
287 }
288 String anchor = url.getRef();
289 if (anchor != null) {
290 anchor = encode(anchor, ANCHOR_ALLOWED_CHARS, UTF_8);
291 }
292 return createNewUrl(url.getProtocol(), url.getUserInfo(), url.getHost(),
293 url.getPort(), path, anchor, query);
294 }
295 catch (final MalformedURLException e) {
296 // Impossible... I think.
297 throw new RuntimeException(e);
298 }
299 }
300
301 /**
302 * Encodes and escapes the specified URI anchor string.
303 *
304 * @param anchor the anchor string to encode and escape
305 * @return the encoded and escaped anchor string
306 */
307 public static String encodeAnchor(final String anchor) {
308 if (anchor == null) {
309 return null;
310 }
311 return encode(anchor, ANCHOR_ALLOWED_CHARS, UTF_8);
312 }
313
314 /**
315 * Encodes and escapes the specified URI hash string.
316 *
317 * @param hash the anchor string to encode and escape
318 * @return the encoded and escaped hash string
319 */
320 public static String encodeHash(final String hash) {
321 if (hash == null) {
322 return null;
323 }
324 return encode(hash, HASH_ALLOWED_CHARS, UTF_8);
325 }
326
327 /**
328 * Encodes and escapes the specified URI hash string.
329 *
330 * @param query the query string to encode and escape
331 * @return the encoded and escaped hash string
332 */
333 public static String encodeQuery(final String query) {
334 if (query == null) {
335 return null;
336 }
337 return encode(query, QUERY_ALLOWED_CHARS, UTF_8);
338 }
339
340 /**
341 * Unescapes and decodes the specified string.
342 *
343 * @param escaped the string to be unescaped and decoded
344 * @return the unescaped and decoded string
345 */
346 public static String decode(final String escaped) {
347 try {
348 final byte[] bytes = escaped.getBytes(US_ASCII);
349 final byte[] bytes2 = decodeUrl(bytes);
350 return new String(bytes2, UTF_8);
351 }
352 catch (final IllegalArgumentException e) {
353 // Should never happen.
354 throw new RuntimeException(e);
355 }
356 }
357
358 /**
359 * Escapes and encodes the specified string. Based on HttpClient 3.1's <code>URIUtil.encode()</code> method.
360 *
361 * @param unescaped the string to encode
362 * @param allowed allowed characters that shouldn't be escaped
363 * @param charset the charset to use
364 * @return the escaped string
365 */
366 private static String encode(final String unescaped, final BitSet allowed, final Charset charset) {
367 final byte[] bytes = unescaped.getBytes(charset);
368 final byte[] bytes2 = encodeUrl(allowed, bytes);
369 return encodePercentSign(bytes2);
370 }
371
372 /**
373 * Encodes every occurrence of the escape character '%' in the given input
374 * string that is not followed by two hexadecimal characters.
375 * @param input the input bytes
376 * @return the given input string where every occurrence of <code>%</code> in
377 * invalid escape sequences has been replace by <code>%25</code>
378 */
379 private static String encodePercentSign(final byte[] input) {
380 if (input == null) {
381 return null;
382 }
383
384 final StringBuilder result = new StringBuilder(new String(input, US_ASCII));
385 int state = -0;
386 int offset = 0;
387 for (int i = 0; i < input.length; i++) {
388 final byte b = input[i];
389 if (state == 0 && b == '%') {
390 state = 1;
391 }
392 else if (state == 1 || state == 2) {
393 if (('0' <= b && b <= '9')
394 || ('A' <= b && b <= 'F')
395 || ('a' <= b && b <= 'f')) {
396 state++;
397 if (state == 3) {
398 state = 0;
399 }
400 }
401 else {
402 final int st = i - state + offset;
403 result.replace(st, st + 1, "%25");
404 offset = offset + 2;
405 state = b == '%' ? 1 : 0;
406 }
407 }
408 }
409 if (state == 1 || state == 2) {
410 final int st = input.length - state + offset;
411 result.replace(st, st + 1, "%25");
412 }
413 return result.toString();
414 }
415
416 /**
417 * Creates and returns a new URL using only the protocol and authority from the given one.
418 * @param u the URL on which to base the returned URL
419 * @return a new URL using only the protocol and authority from the given one
420 * @throws MalformedURLException if there is a problem creating the new URL
421 */
422 public static URL getUrlWithoutPathRefQuery(final URL u) throws MalformedURLException {
423 return createNewUrl(u.getProtocol(), u.getAuthority(), null, null, null);
424 }
425
426 /**
427 * Creates and returns a new URL using only the protocol, authority and path
428 * from the given one.
429 * @param u the URL on which to base the returned URL
430 * @return a new URL using only the protocol and authority from the given one
431 * @throws MalformedURLException if there is a problem creating the new URL
432 */
433 public static URL getUrlWithoutRef(final URL u) throws MalformedURLException {
434 return createNewUrl(u.getProtocol(), u.getAuthority(), u.getPath(), null, u.getQuery());
435 }
436
437 /**
438 * Creates and returns a new URL identical to the specified URL, except using the specified protocol.
439 * @param u the URL on which to base the returned URL
440 * @param newProtocol the new protocol to use in the returned URL
441 * @return a new URL identical to the specified URL, except using the specified protocol
442 * @throws MalformedURLException if there is a problem creating the new URL
443 */
444 public static URL getUrlWithNewProtocol(final URL u, final String newProtocol) throws MalformedURLException {
445 return createNewUrl(newProtocol, u.getAuthority(), u.getPath(), u.getRef(), u.getQuery());
446 }
447
448 /**
449 * Creates and returns a new URL identical to the specified URL, except using the specified host.
450 * @param u the URL on which to base the returned URL
451 * @param newHost the new host to use in the returned URL
452 * @return a new URL identical to the specified URL, except using the specified host
453 * @throws MalformedURLException if there is a problem creating the new URL
454 */
455 public static URL getUrlWithNewHost(final URL u, final String newHost)
456 throws MalformedURLException {
457 return createNewUrl(u.getProtocol(), u.getUserInfo(), newHost,
458 u.getPort(), u.getPath(), u.getRef(), u.getQuery());
459 }
460
461 /**
462 * Creates and returns a new URL identical to the specified URL, except using the specified host.
463 * @param u the URL on which to base the returned URL
464 * @param newHost the new host to use in the returned URL
465 * @param newPort the new port to use in the returned URL
466 * @return a new URL identical to the specified URL, except using the specified host
467 * @throws MalformedURLException if there is a problem creating the new URL
468 */
469 public static URL getUrlWithNewHostAndPort(final URL u, final String newHost, final int newPort)
470 throws MalformedURLException {
471 return createNewUrl(u.getProtocol(), u.getUserInfo(), newHost, newPort, u.getPath(), u.getRef(), u.getQuery());
472 }
473
474 /**
475 * Creates and returns a new URL identical to the specified URL, except using the specified port.
476 * @param u the URL on which to base the returned URL
477 * @param newPort the new port to use in the returned URL or -1 to remove it
478 * @return a new URL identical to the specified URL, except using the specified port
479 * @throws MalformedURLException if there is a problem creating the new URL
480 */
481 public static URL getUrlWithNewPort(final URL u, final int newPort) throws MalformedURLException {
482 return createNewUrl(u.getProtocol(), u.getUserInfo(), u.getHost(),
483 newPort, u.getPath(), u.getRef(), u.getQuery());
484 }
485
486 /**
487 * Creates and returns a new URL identical to the specified URL, except using the specified path.
488 * @param u the URL on which to base the returned URL
489 * @param newPath the new path to use in the returned URL
490 * @return a new URL identical to the specified URL, except using the specified path
491 * @throws MalformedURLException if there is a problem creating the new URL
492 */
493 public static URL getUrlWithNewPath(final URL u, final String newPath) throws MalformedURLException {
494 return createNewUrl(u.getProtocol(), u.getAuthority(), newPath, u.getRef(), u.getQuery());
495 }
496
497 /**
498 * Creates and returns a new URL identical to the specified URL, except using the specified reference.
499 * @param u the URL on which to base the returned URL
500 * @param newRef the new reference to use in the returned URL or null to remove it
501 * @return a new URL identical to the specified URL, except using the specified reference
502 * @throws MalformedURLException if there is a problem creating the new URL
503 */
504 public static URL getUrlWithNewRef(final URL u, final String newRef) throws MalformedURLException {
505 return createNewUrl(u.getProtocol(), u.getAuthority(), u.getPath(), newRef, u.getQuery());
506 }
507
508 /**
509 * Creates and returns a new URL identical to the specified URL, except using the specified query string.
510 * @param u the URL on which to base the returned URL
511 * @param newQuery the new query string to use in the returned URL
512 * @return a new URL identical to the specified URL, except using the specified query string
513 * @throws MalformedURLException if there is a problem creating the new URL
514 */
515 public static URL getUrlWithNewQuery(final URL u, final String newQuery) throws MalformedURLException {
516 return createNewUrl(u.getProtocol(), u.getAuthority(), u.getPath(), u.getRef(), newQuery);
517 }
518
519 /**
520 * Creates and returns a new URL identical to the specified URL, ignoring path, protocol and query.
521 * @param u the URL on which to base the returned URL
522 * @return a new URL identical to the specified URL, ignoring path, protocol and query
523 * @throws MalformedURLException if there is a problem creating the new URL
524 */
525 public static URL getUrlWithProtocolAndAuthority(final URL u) throws MalformedURLException {
526 return createNewUrl(u.getProtocol(), u.getAuthority(), null, null, null);
527 }
528
529 /**
530 * Creates and returns a new URL identical to the specified URL but with a changed user name.
531 * @param u the URL on which to base the returned URL
532 * @param newUserName the new user name or null to remove it
533 * @return a new URL identical to the specified URL; only user name updated
534 * @throws MalformedURLException if there is a problem creating the new URL
535 */
536 public static URL getUrlWithNewUserName(final URL u, final String newUserName) throws MalformedURLException {
537 String newUserInfo = newUserName == null ? "" : newUserName;
538 final String userInfo = u.getUserInfo();
539 if (StringUtils.isNotBlank(userInfo)) {
540 final int colonIdx = userInfo.indexOf(':');
541 if (colonIdx > -1) {
542 newUserInfo = newUserInfo + userInfo.substring(colonIdx);
543 }
544 }
545 return createNewUrl(u.getProtocol(), newUserInfo.isEmpty() ? null : newUserInfo,
546 u.getHost(), u.getPort(), u.getPath(), u.getRef(), u.getQuery());
547 }
548
549 /**
550 * Creates and returns a new URL identical to the specified URL but with a changed user password.
551 * @param u the URL on which to base the returned URL
552 * @param newUserPassword the new user password or null to remove it
553 * @return a new URL identical to the specified URL; only user name updated
554 * @throws MalformedURLException if there is a problem creating the new URL
555 */
556 public static URL getUrlWithNewUserPassword(final URL u, final String newUserPassword)
557 throws MalformedURLException {
558 String newUserInfo = newUserPassword == null ? "" : ':' + newUserPassword;
559 final String userInfo = u.getUserInfo();
560 if (StringUtils.isNotBlank(userInfo)) {
561 final int colonIdx = userInfo.indexOf(':');
562 if (colonIdx > -1) {
563 newUserInfo = userInfo.substring(0, colonIdx) + newUserInfo;
564 }
565 else {
566 newUserInfo = userInfo + newUserInfo;
567 }
568 }
569 return createNewUrl(u.getProtocol(), newUserInfo.isEmpty() ? null : newUserInfo,
570 u.getHost(), u.getPort(), u.getPath(), u.getRef(), u.getQuery());
571 }
572
573 /**
574 * Creates a new URL based on the specified fragments.
575 * @param protocol the protocol to use (may not be {@code null})
576 * @param userInfo the user info to use (may be {@code null})
577 * @param host the host to use (may not be {@code null})
578 * @param port the port to use (may be <code>-1</code> if no port is specified)
579 * @param path the path to use (may be {@code null} and may omit the initial <code>'/'</code>)
580 * @param ref the reference to use (may be {@code null} and must not include the <code>'#'</code>)
581 * @param query the query to use (may be {@code null} and must not include the <code>'?'</code>)
582 * @return a new URL based on the specified fragments
583 * @throws MalformedURLException if there is a problem creating the new URL
584 */
585 private static URL createNewUrl(final String protocol, final String userInfo, final String host, final int port,
586 final String path, final String ref, final String query) throws MalformedURLException {
587 final StringBuilder s = new StringBuilder();
588 s.append(protocol).append("://");
589 if (userInfo != null) {
590 s.append(userInfo).append('@');
591 }
592 s.append(host);
593 if (port != -1) {
594 s.append(':').append(port);
595 }
596 if (path != null && !path.isEmpty()) {
597 if ('/' != path.charAt(0)) {
598 s.append('/');
599 }
600 s.append(path);
601 }
602 if (query != null) {
603 s.append('?').append(query);
604 }
605 if (ref != null) {
606 if (ref.isEmpty() || ref.charAt(0) != '#') {
607 s.append('#');
608 }
609 s.append(ref);
610 }
611
612 return new URL(s.toString());
613 }
614
615 /**
616 * Creates a new URL based on the specified fragments.
617 * @param protocol the protocol to use (may not be {@code null})
618 * @param authority the authority to use (may not be {@code null})
619 * @param path the path to use (may be {@code null} and may omit the initial <code>'/'</code>)
620 * @param ref the reference to use (may be {@code null} and must not include the <code>'#'</code>)
621 * @param query the query to use (may be {@code null} and must not include the <code>'?'</code>)
622 * @return a new URL based on the specified fragments
623 * @throws MalformedURLException if there is a problem creating the new URL
624 */
625 private static URL createNewUrl(final String protocol, final String authority,
626 final String path, final String ref, final String query) throws MalformedURLException {
627
628 // pre-compute length of StringBuilder
629 int len = protocol.length() + 1;
630 if (authority != null && !authority.isEmpty()) {
631 len += 2 + authority.length();
632 }
633 if (path != null) {
634 len += path.length();
635 }
636 if (query != null) {
637 len += 1 + query.length();
638 }
639 if (ref != null) {
640 len += 1 + ref.length();
641 }
642
643 final StringBuilder s = new StringBuilder(len);
644 s.append(protocol).append(':');
645 if (authority != null && !authority.isEmpty()) {
646 s.append("//").append(authority);
647 }
648 if (path != null) {
649 s.append(path);
650 }
651 if (query != null) {
652 s.append('?').append(query);
653 }
654 if (ref != null) {
655 if (ref.isEmpty() || ref.charAt(0) != '#') {
656 s.append('#');
657 }
658 s.append(ref);
659 }
660
661 return toUrlSafe(s.toString());
662 }
663
664 /**
665 * Resolves a given relative URL against a base URL. See
666 * <a href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a>
667 * Section 4 for more details.
668 *
669 * @param baseUrl The base URL in which to resolve the specification.
670 * @param relativeUrl The relative URL to resolve against the base URL.
671 * @return the resolved specification.
672 */
673 public static String resolveUrl(final String baseUrl, final String relativeUrl) {
674 if (baseUrl == null) {
675 throw new IllegalArgumentException("Base URL must not be null");
676 }
677 if (relativeUrl == null) {
678 throw new IllegalArgumentException("Relative URL must not be null");
679 }
680 final Url url = resolveUrl(parseUrl(baseUrl), relativeUrl);
681
682 return url.toString();
683 }
684
685 /**
686 * Resolves a given relative URL against a base URL. See
687 * <a href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a>
688 * Section 4 for more details.
689 *
690 * @param baseUrl The base URL in which to resolve the specification.
691 * @param relativeUrl The relative URL to resolve against the base URL.
692 * @return the resolved specification.
693 */
694 public static String resolveUrl(final URL baseUrl, final String relativeUrl) {
695 if (baseUrl == null) {
696 throw new IllegalArgumentException("Base URL must not be null");
697 }
698 return resolveUrl(baseUrl.toExternalForm(), relativeUrl);
699 }
700
701 /**
702 * Parses a given specification using the algorithm depicted in
703 * <a href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a>:
704 * <p>
705 * Section 2.4: Parsing a URL
706 * <p>
707 * An accepted method for parsing URLs is useful to clarify the
708 * generic-RL syntax of Section 2.2 and to describe the algorithm for
709 * resolving relative URLs presented in Section 4. This section
710 * describes the parsing rules for breaking down a URL (relative or
711 * absolute) into the component parts described in Section 2.1. The
712 * rules assume that the URL has already been separated from any
713 * surrounding text and copied to a "parse string". The rules are
714 * listed in the order in which they would be applied by the parser.
715 *
716 * @param spec The specification to parse.
717 * @return the parsed specification.
718 */
719 private static Url parseUrl(String spec) {
720 final Url url = new Url();
721 int startIndex = 0;
722 int endIndex = spec.length();
723
724 // see https://url.spec.whatwg.org/#concept-basic-url-parser
725 // * If input contains any leading or trailing C0 control or space, validation error.
726 // Remove any leading and trailing C0 control or space from input.
727 // * If input contains any ASCII tab or newline, validation error.
728 // Remove all ASCII tab or newline from input.
729
730 if (endIndex > startIndex) {
731 StringBuilder sb = null;
732 boolean before = true;
733 int trailing = 0;
734
735 for (int i = 0; i < endIndex; i++) {
736 final char c = spec.charAt(i);
737 boolean remove = false;
738
739 if (c == '\t' | c == '\r' | c == '\n') {
740 remove = true;
741 }
742 else if ('\u0000' <= c && c <= '\u0020') {
743 if (before) {
744 remove = true;
745 }
746 else {
747 trailing++;
748 }
749 }
750 else {
751 before = false;
752 trailing = 0;
753 }
754
755 if (remove) {
756 if (sb == null) {
757 sb = new StringBuilder(spec.substring(0, i));
758 }
759 }
760 else if (sb != null) {
761 sb.append(c);
762 }
763 }
764
765 if (sb == null) {
766 if (trailing > 0) {
767 endIndex = spec.length() - trailing;
768 spec = spec.substring(0, endIndex);
769 }
770 }
771 else {
772 if (trailing > 0) {
773 spec = sb.substring(0, sb.length() - trailing);
774 }
775 else {
776 spec = sb.toString();
777 }
778 endIndex = spec.length();
779 }
780 }
781
782 // Section 2.4.1: Parsing the Fragment Identifier
783 //
784 // If the parse string contains a crosshatch "#" character, then the
785 // substring after the first (left-most) crosshatch "#" and up to the
786 // end of the parse string is the <fragment> identifier. If the
787 // crosshatch is the last character, or no crosshatch is present, then
788 // the fragment identifier is empty. The matched substring, including
789 // the crosshatch character, is removed from the parse string before
790 // continuing.
791 //
792 // Note that the fragment identifier is not considered part of the URL.
793 // However, since it is often attached to the URL, parsers must be able
794 // to recognize and set aside fragment identifiers as part of the
795 // process.
796 final int crosshatchIndex = StringUtils.indexOf(spec, '#', startIndex, endIndex);
797
798 if (crosshatchIndex >= 0) {
799 url.fragment_ = spec.substring(crosshatchIndex + 1, endIndex);
800 endIndex = crosshatchIndex;
801 }
802 // Section 2.4.2: Parsing the Scheme
803 //
804 // If the parse string contains a colon ":" after the first character
805 // and before any characters not allowed as part of a scheme name (i.e.,
806 // any not an alphanumeric, plus "+", period ".", or hyphen "-"), the
807 // <scheme> of the URL is the substring of characters up to but not
808 // including the first colon. These characters and the colon are then
809 // removed from the parse string before continuing.
810 final int colonIndex = StringUtils.indexOf(spec, ':', startIndex, endIndex);
811
812 if (colonIndex > 0) {
813 final String scheme = spec.substring(startIndex, colonIndex);
814 if (isValidScheme(scheme)) {
815 url.scheme_ = scheme;
816 startIndex = colonIndex + 1;
817 }
818 }
819 // Section 2.4.3: Parsing the Network Location/Login
820 //
821 // If the parse string begins with a double-slash "//", then the
822 // substring of characters after the double-slash and up to, but not
823 // including, the next slash "/" character is the network location/login
824 // (<net_loc>) of the URL. If no trailing slash "/" is present, the
825 // entire remaining parse string is assigned to <net_loc>. The double-
826 // slash and <net_loc> are removed from the parse string before
827 // continuing.
828 //
829 // Note: We also accept a question mark "?" or a semicolon ";" character as
830 // delimiters for the network location/login (<net_loc>) of the URL.
831 final int locationStartIndex;
832 int locationEndIndex;
833
834 if (spec.startsWith("//", startIndex)) {
835 locationStartIndex = startIndex + 2;
836 locationEndIndex = StringUtils.indexOf(spec, '/', locationStartIndex, endIndex);
837 if (locationEndIndex >= 0) {
838 startIndex = locationEndIndex;
839 }
840 }
841 else {
842 locationStartIndex = -1;
843 locationEndIndex = -1;
844 }
845 // Section 2.4.4: Parsing the Query Information
846 //
847 // If the parse string contains a question mark "?" character, then the
848 // substring after the first (left-most) question mark "?" and up to the
849 // end of the parse string is the <query> information. If the question
850 // mark is the last character, or no question mark is present, then the
851 // query information is empty. The matched substring, including the
852 // question mark character, is removed from the parse string before
853 // continuing.
854 final int questionMarkIndex = StringUtils.indexOf(spec, '?', startIndex, endIndex);
855
856 if (questionMarkIndex >= 0) {
857 if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
858 // The substring of characters after the double-slash and up to, but not
859 // including, the question mark "?" character is the network location/login
860 // (<net_loc>) of the URL.
861 locationEndIndex = questionMarkIndex;
862 startIndex = questionMarkIndex;
863 }
864 url.query_ = spec.substring(questionMarkIndex + 1, endIndex);
865 endIndex = questionMarkIndex;
866 }
867 // Section 2.4.5: Parsing the Parameters
868 //
869 // If the parse string contains a semicolon ";" character, then the
870 // substring after the first (left-most) semicolon ";" and up to the end
871 // of the parse string is the parameters (<params>). If the semicolon
872 // is the last character, or no semicolon is present, then <params> is
873 // empty. The matched substring, including the semicolon character, is
874 // removed from the parse string before continuing.
875 final int semicolonIndex = StringUtils.indexOf(spec, ';', startIndex, endIndex);
876
877 if (semicolonIndex >= 0) {
878 if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
879 // The substring of characters after the double-slash and up to, but not
880 // including, the semicolon ";" character is the network location/login
881 // (<net_loc>) of the URL.
882 locationEndIndex = semicolonIndex;
883 startIndex = semicolonIndex;
884 }
885 url.parameters_ = spec.substring(semicolonIndex + 1, endIndex);
886 endIndex = semicolonIndex;
887 }
888 // Section 2.4.6: Parsing the Path
889 //
890 // After the above steps, all that is left of the parse string is the
891 // URL <path> and the slash "/" that may precede it. Even though the
892 // initial slash is not part of the URL path, the parser must remember
893 // whether or not it was present so that later processes can
894 // differentiate between relative and absolute paths. Often this is
895 // done by simply storing the preceding slash along with the path.
896 if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
897 // The entire remaining parse string is assigned to the network
898 // location/login (<net_loc>) of the URL.
899 locationEndIndex = endIndex;
900 }
901 else if (startIndex < endIndex) {
902 url.path_ = spec.substring(startIndex, endIndex);
903 }
904 // Set the network location/login (<net_loc>) of the URL.
905 if ((locationStartIndex >= 0) && (locationEndIndex >= 0)) {
906 url.location_ = spec.substring(locationStartIndex, locationEndIndex);
907 }
908 return url;
909 }
910
911 /**
912 * Returns true if specified string is a valid scheme name.
913 * <p>
914 * https://tools.ietf.org/html/rfc1738
915 * <p>
916 * Scheme names consist of a sequence of characters. The lower case
917 * letters "a"--"z", digits, and the characters plus ("+"), period
918 * ("."), and hyphen ("-") are allowed. For resiliency, programs
919 * interpreting URLs should treat upper case letters as equivalent to
920 * lower case in scheme names (e.g., allow "HTTP" as well as "http").
921 *
922 * @param scheme the scheme string to check
923 * @return true if valid
924 */
925 public static boolean isValidScheme(final String scheme) {
926 final int length = scheme.length();
927 if (length < 1) {
928 return false;
929 }
930
931 char c = scheme.charAt(0);
932 boolean isValid = ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
933 if (!isValid) {
934 return false;
935 }
936
937 for (int i = 1; i < length; i++) {
938 c = scheme.charAt(i);
939 isValid =
940 ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')
941 || ('0' <= c && c <= '9')
942 || c == '+'
943 || c == '.'
944 || c == '-';
945 if (!isValid) {
946 return false;
947 }
948 }
949
950 return true;
951 }
952
953 /**
954 * Returns true if specified string is a special scheme.
955 * see <a href='https://url.spec.whatwg.org/#special-scheme'>
956 * https://url.spec.whatwg.org/#special-scheme</a>
957 *
958 * @param scheme the scheme string to check
959 * @return true if special
960 */
961 public static boolean isSpecialScheme(final String scheme) {
962 final int length = scheme.length();
963 if (length < 2 || length > 5) {
964 return false;
965 }
966
967 final String schemeLC = scheme.toLowerCase(Locale.ROOT);
968 return "ftp".equals(schemeLC)
969 || "file".equals(schemeLC)
970 || "http".equals(schemeLC)
971 || "https".equals(schemeLC)
972 || "ws".equals(schemeLC)
973 || "wss".equals(schemeLC);
974 }
975
/**
 * Resolves a given relative URL against a base URL using the algorithm
 * depicted in <a href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a>:
 * <p>
 * Section 4: Resolving Relative URLs
 * <p>
 * This section describes an example algorithm for resolving URLs within
 * a context in which the URLs may be relative, such that the result is
 * always a URL in absolute form. Although this algorithm cannot
 * guarantee that the resulting URL will equal that intended by the
 * original author, it does guarantee that any valid URL (relative or
 * absolute) can be consistently transformed to an absolute form given a
 * valid base URL.
 * <p>
 * In addition to the RFC steps, leading "../" segments directly after the
 * initial slash of the resulting path are dropped
 * (see {@code removeLeadingSlashPoints}).
 *
 * @param baseUrl The base URL in which to resolve the specification;
 *        if {@code null} the parsed relative URL is returned as is.
 * @param relativeUrl The relative URL to resolve against the base URL.
 * @return the resolved specification.
 */
private static Url resolveUrl(final Url baseUrl, final String relativeUrl) {
    final Url url = parseUrl(relativeUrl);
    // Step 1: The base URL is established according to the rules of
    //         Section 3.  If the base URL is the empty string (unknown),
    //         the embedded URL is interpreted as an absolute URL and
    //         we are done.
    if (baseUrl == null) {
        return url;
    }
    // Step 2: Both the base and embedded URLs are parsed into their
    //         component parts as described in Section 2.4.
    //      a) If the embedded URL is entirely empty, it inherits the
    //         entire base URL (i.e., is set equal to the base URL)
    //         and we are done.
    //         (a copy of the base is returned, the base itself is not exposed)
    if (relativeUrl.isEmpty()) {
        return new Url(baseUrl);
    }
    //      b) If the embedded URL starts with a scheme name, it is
    //         interpreted as an absolute URL and we are done.
    if (url.scheme_ != null) {
        return url;
    }
    //      c) Otherwise, the embedded URL inherits the scheme of
    //         the base URL.
    url.scheme_ = baseUrl.scheme_;
    // Step 3: If the embedded URL's <net_loc> is non-empty, we skip to
    //         Step 7.  Otherwise, the embedded URL inherits the <net_loc>
    //         (if any) of the base URL.
    if (url.location_ != null) {
        return url;
    }
    url.location_ = baseUrl.location_;
    // Step 4: If the embedded URL path is preceded by a slash "/", the
    //         path is not relative and we skip to Step 7.
    //         (not in the RFC: "../" segments directly after the leading
    //         slash are removed first)
    if (url.path_ != null && !url.path_.isEmpty() && url.path_.charAt(0) == '/') {
        url.path_ = removeLeadingSlashPoints(url.path_);
        return url;
    }
    // Step 5: If the embedded URL path is empty (and not preceded by a
    //         slash), then the embedded URL inherits the base URL path,
    //         and
    if (url.path_ == null) {
        url.path_ = baseUrl.path_;
        //  a) if the embedded URL's <params> is non-empty, we skip to
        //     step 7; otherwise, it inherits the <params> of the base
        //     URL (if any) and
        if (url.parameters_ != null) {
            return url;
        }
        url.parameters_ = baseUrl.parameters_;
        //  b) if the embedded URL's <query> is non-empty, we skip to
        //     step 7; otherwise, it inherits the <query> of the base
        //     URL (if any) and we skip to step 7.
        if (url.query_ != null) {
            return url;
        }
        url.query_ = baseUrl.query_;
        return url;
    }
    // Step 6: The last segment of the base URL's path (anything
    //         following the rightmost slash "/", or the entire path if no
    //         slash is present) is removed and the embedded URL's path is
    //         appended in its place.  The following operations are
    //         then applied, in order, to the new path:
    final String basePath = baseUrl.path_;
    // stays empty if the base path contains no slash at all
    String path = "";

    if (basePath == null) {
        // a base without path is handled like the root path
        path = "/";
    }
    else {
        final int lastSlashIndex = basePath.lastIndexOf('/');

        if (lastSlashIndex >= 0) {
            // keep everything up to and including the rightmost slash
            path = basePath.substring(0, lastSlashIndex + 1);
        }
    }

    path = path.concat(url.path_);
    //      a) All occurrences of "./", where "." is a complete path
    //         segment, are removed.
    int pathSegmentIndex;

    while ((pathSegmentIndex = path.indexOf("/./")) >= 0) {
        // keep the first slash, drop the following "./"
        path = path.substring(0, pathSegmentIndex + 1).concat(path.substring(pathSegmentIndex + 3));
    }
    //      b) If the path ends with "." as a complete path segment,
    //         that "." is removed.
    if (path.endsWith("/.")) {
        path = path.substring(0, path.length() - 1);
    }
    //      c) All occurrences of "<segment>/../", where <segment> is a
    //         complete path segment not equal to "..", are removed.
    //         Removal of these path segments is performed iteratively,
    //         removing the leftmost matching pattern on each iteration,
    //         until no matching pattern remains.
    //         (a "/../" at index 0 is not touched by this loop)
    while ((pathSegmentIndex = path.indexOf("/../")) > 0) {
        // everything in front of the leftmost "/../"
        final String pathSegment = path.substring(0, pathSegmentIndex);
        final int slashIndex = pathSegment.lastIndexOf('/');

        if (slashIndex >= 0) {
            // NOTE(review): the substring starts at the slash itself, so it
            // always begins with "/" and can never be equal to ".."; this
            // check is therefore always true and the segment is always
            // removed - confirm whether that is intended.
            if (!"..".equals(pathSegment.substring(slashIndex))) {
                path = path.substring(0, slashIndex + 1).concat(path.substring(pathSegmentIndex + 4));
            }
        }
        else {
            // no slash in front: drop the leading segment together with the "/../"
            path = path.substring(pathSegmentIndex + 4);
        }
    }
    //      d) If the path ends with "<segment>/..", where <segment> is a
    //         complete path segment not equal to "..", that
    //         "<segment>/.." is removed.
    if (path.endsWith("/..")) {
        // the part in front of the trailing "/.."
        final String pathSegment = path.substring(0, path.length() - 3);
        final int slashIndex = pathSegment.lastIndexOf('/');

        if (slashIndex >= 0) {
            path = path.substring(0, slashIndex + 1);
        }
    }

    // not in the RFC: drop "../" segments directly after the leading slash
    path = removeLeadingSlashPoints(path);

    url.path_ = path;
    // Step 7: The resulting URL components, including any inherited from
    //         the base URL, are recombined to give the absolute form of
    //         the embedded URL.
    return url;
}
1123
1124 /**
1125 * "../" after the leading "/" should be removed as browsers do (not in RFC)
1126 */
1127 private static String removeLeadingSlashPoints(final String path) {
1128 int i = 1;
1129 while (path.startsWith("../", i)) {
1130 i = i + 3;
1131 }
1132
1133 if (i > 1) {
1134 return "/" + path.substring(i);
1135 }
1136
1137 return path;
1138 }
1139
1140 /**
1141 * Class <code>Url</code> represents a Uniform Resource Locator.
1142 *
1143 * @author Martin Tamme
1144 */
1145 private static class Url {
1146
1147 private String scheme_;
1148 private String location_;
1149 private String path_;
1150 private String parameters_;
1151 private String query_;
1152 private String fragment_;
1153
1154 /**
1155 * Creates a <code>Url</code> object.
1156 */
1157 Url() {
1158 super();
1159 }
1160
1161 /**
1162 * Creates a <code>Url</code> object from the specified
1163 * <code>Url</code> object.
1164 *
1165 * @param url a <code>Url</code> object.
1166 */
1167 Url(final Url url) {
1168 scheme_ = url.scheme_;
1169 location_ = url.location_;
1170 path_ = url.path_;
1171 parameters_ = url.parameters_;
1172 query_ = url.query_;
1173 fragment_ = url.fragment_;
1174 }
1175
1176 /**
1177 * Returns a string representation of the <code>Url</code> object.
1178 *
1179 * @return a string representation of the <code>Url</code> object.
1180 */
1181 @Override
1182 public String toString() {
1183 final StringBuilder sb = new StringBuilder();
1184
1185 if (scheme_ != null) {
1186 sb.append(scheme_).append(':');
1187 }
1188 if (location_ != null) {
1189 sb.append("//").append(location_);
1190 }
1191 if (path_ != null) {
1192 sb.append(path_);
1193 }
1194 if (parameters_ != null) {
1195 sb.append(';').append(parameters_);
1196 }
1197 if (query_ != null) {
1198 sb.append('?').append(query_);
1199 }
1200 if (fragment_ != null) {
1201 sb.append('#').append(fragment_);
1202 }
1203 return sb.toString();
1204 }
1205 }
1206
1207 static boolean isNormalUrlProtocol(final String protocol) {
1208 return "http".equals(protocol) || "https".equals(protocol) || "file".equals(protocol);
1209 }
1210
1211 /**
1212 * More or less the same as sameFile(URL, URL) but without
1213 * resolving the host to an IP address for comparing.
1214 * Additionally we do some path normalization.
1215 *
1216 * @param u1 a URL object
1217 * @param u2 a URL object
1218 * @return true if u1 and u2 refer to the same file
1219 */
1220 public static boolean sameFile(final URL u1, final URL u2) {
1221 if (u1 == u2) {
1222 return true;
1223 }
1224 if (u1 == null || u2 == null) {
1225 return false;
1226 }
1227
1228 // Compare the protocols.
1229 final String p1 = u1.getProtocol();
1230 final String p2 = u2.getProtocol();
1231 if (!(p1 == p2 || (p1 != null && p1.equalsIgnoreCase(p2)))) {
1232 return false;
1233 }
1234
1235 // Compare the ports.
1236 final int port1 = (u1.getPort() == -1) ? u1.getDefaultPort() : u1.getPort();
1237 final int port2 = (u2.getPort() == -1) ? u2.getDefaultPort() : u2.getPort();
1238 if (port1 != port2) {
1239 return false;
1240 }
1241
1242 // Compare the hosts.
1243 final String h1 = u1.getHost();
1244 final String h2 = u2.getHost();
1245 if (!(h1 == h2 || (h1 != null && h1.equalsIgnoreCase(h2)))) {
1246 return false;
1247 }
1248
1249 // Compare the files.
1250 String f1 = u1.getFile();
1251 if (f1.isEmpty()) {
1252 f1 = "/";
1253 }
1254 String f2 = u2.getFile();
1255 if (f2.isEmpty()) {
1256 f2 = "/";
1257 }
1258 if (f1.indexOf('.') > 0 || f2.indexOf('.') > 0) {
1259 try {
1260 f1 = u1.toURI().normalize().toURL().getFile();
1261 f2 = u2.toURI().normalize().toURL().getFile();
1262 }
1263 catch (final RuntimeException e) {
1264 throw e;
1265 }
1266 catch (final Exception ignored) {
1267 // ignore
1268 }
1269 }
1270
1271 return Objects.equals(f1, f2);
1272 }
1273
1274 /**
1275 * Helper that constructs a normalized url string
1276 * usable as cache key.
1277 *
1278 * @param url a URL object
1279 * @return the normalized string
1280 */
1281 public static String normalize(final URL url) {
1282 final StringBuilder result = new StringBuilder();
1283 result.append(url.getProtocol())
1284 .append("://")
1285 .append(url.getHost())
1286 .append(':')
1287 .append((url.getPort() == -1) ? url.getDefaultPort() : url.getPort());
1288
1289 // Compare the files.
1290 String f = url.getFile();
1291 if (f.isEmpty()) {
1292 result.append('/');
1293 }
1294 else {
1295 if (f.indexOf('.') > 0) {
1296 try {
1297 f = url.toURI().normalize().toURL().getFile();
1298 }
1299 catch (final Exception ignored) {
1300 // ignore
1301 }
1302 }
1303 result.append(f);
1304 }
1305
1306 return result.toString();
1307 }
1308
1309 /**
1310 * Constructs a {@link URI} using the specified URL.
1311 *
1312 * @param url the URL
1313 * @param query the query
1314 *
1315 * @throws URISyntaxException
1316 * If both a scheme and a path are given but the path is
1317 * relative, if the URI string constructed from the given
1318 * components violates RFC 2396, or if the authority
1319 * component of the string is present but cannot be parsed
1320 * as a server-based authority
1321 * @return the URI
1322 */
1323 public static URI toURI(final URL url, final String query) throws URISyntaxException {
1324 final String scheme = url.getProtocol();
1325 final String host = url.getHost();
1326 final int port = url.getPort();
1327 final String path = url.getPath();
1328 final StringBuilder buffer = new StringBuilder();
1329 if (host != null) {
1330 if (scheme != null) {
1331 buffer.append(scheme).append("://");
1332 }
1333 buffer.append(host);
1334 if (port > 0) {
1335 buffer.append(':').append(port);
1336 }
1337 }
1338 if (path == null || path.isEmpty() || path.charAt(0) != '/') {
1339 buffer.append('/');
1340 }
1341 if (path != null) {
1342 buffer.append(path);
1343 }
1344 if (query != null) {
1345 buffer.append('?').append(query);
1346 }
1347 return new URI(buffer.toString());
1348 }
1349
1350 /**
1351 * @param part the part to encode
1352 * @return the ecoded string
1353 */
1354 public static String encodeQueryPart(final String part) {
1355 if (part == null || part.isEmpty()) {
1356 return "";
1357 }
1358
1359 try {
1360 return URLEncoder.encode(part, "UTF-8");
1361 }
1362 catch (final UnsupportedEncodingException e) {
1363 return part;
1364 }
1365 }
1366
1367 /**
1368 * Removes the well known ports if it can be deduced from protocol.
1369 * @param url the url to clean up
1370 * @return a new URL without the port or the given one
1371 * @throws MalformedURLException if the URL string cannot be converted to a URL instance
1372 */
1373 public static URL removeRedundantPort(final URL url) throws MalformedURLException {
1374 if (("https".equals(url.getProtocol()) && url.getPort() == 443)
1375 || ("http".equals(url.getProtocol()) && url.getPort() == 80)) {
1376 return getUrlWithNewPort(url, -1);
1377 }
1378 return url;
1379 }
1380
1381 /**
1382 * Decodes an array of URL safe 7-bit characters into an array of original bytes.
1383 * Escaped characters are converted back to their original representation.
1384 * @param bytes array of URL safe characters
1385 * @param removeWhitespace if true don't add whitespace chars to the output
1386 * @return array of original bytes
1387 * @throws IllegalArgumentException in case of error
1388 */
1389 public static byte[] decodeDataUrl(final byte[] bytes, final boolean removeWhitespace)
1390 throws IllegalArgumentException {
1391 // adapted from apache commons codec
1392 if (bytes == null) {
1393 return null;
1394 }
1395 final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
1396 for (int i = 0; i < bytes.length; i++) {
1397 int b = bytes[i];
1398 if (b == '%') {
1399 try {
1400 final int u = digit16(bytes[++i]);
1401 final int l = digit16(bytes[++i]);
1402 b = (u << 4) + l;
1403 }
1404 catch (final ArrayIndexOutOfBoundsException e) {
1405 throw new IllegalArgumentException("Invalid URL encoding: ", e);
1406 }
1407 }
1408 if (removeWhitespace
1409 && (b == 9 || b == 10 || b == 12 || b == 13 || b == 32)) {
1410 continue;
1411 }
1412
1413 buffer.write(b);
1414 }
1415 return buffer.toByteArray();
1416 }
1417
1418 /**
1419 * Decodes an array of URL safe 7-bit characters into an array of original bytes.
1420 * Escaped characters are converted back to their original representation.
1421 * @param bytes array of URL safe characters
1422 * @return array of original bytes
1423 * @throws IllegalArgumentException in case of error
1424 */
1425 public static byte[] decodeUrl(final byte[] bytes) throws IllegalArgumentException {
1426 // adapted from apache commons codec
1427 if (bytes == null) {
1428 return null;
1429 }
1430 final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
1431 for (int i = 0; i < bytes.length; i++) {
1432 final int b = bytes[i];
1433 if (b == '+') {
1434 buffer.write(' ');
1435 }
1436 else if (b == '%') {
1437 try {
1438 final int u = digit16(bytes[++i]);
1439 final int l = digit16(bytes[++i]);
1440 buffer.write((char) ((u << 4) + l));
1441 }
1442 catch (final ArrayIndexOutOfBoundsException e) {
1443 throw new IllegalArgumentException("Invalid URL encoding: ", e);
1444 }
1445 }
1446 else {
1447 buffer.write(b);
1448 }
1449 }
1450 return buffer.toByteArray();
1451 }
1452
1453 private static int digit16(final byte b) throws IllegalArgumentException {
1454 final int i = Character.digit((char) b, 16);
1455 if (i == -1) {
1456 throw new IllegalArgumentException("Invalid URL encoding: not a valid digit (radix 16): " + b);
1457 }
1458 return i;
1459 }
1460
1461 /**
1462 * Encodes an array of bytes into an array of URL safe 7-bit characters. Unsafe characters are escaped.
1463 * @param urlsafe bitset of characters deemed URL safe
1464 * @param bytes array of bytes to convert to URL safe characters
1465 * @return array of bytes containing URL safe characters
1466 */
1467 public static byte[] encodeUrl(final BitSet urlsafe, final byte[] bytes) {
1468 // adapted from apache commons codec
1469 if (bytes == null) {
1470 return null;
1471 }
1472
1473 final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
1474 for (final byte c : bytes) {
1475 int b = c;
1476 if (b < 0) {
1477 b = 256 + b;
1478 }
1479 if (urlsafe.get(b)) {
1480 if (b == ' ') {
1481 b = '+';
1482 }
1483 buffer.write(b);
1484 }
1485 else {
1486 buffer.write('%');
1487 final char hex1 = hexDigit(b >> 4);
1488 final char hex2 = hexDigit(b);
1489 buffer.write(hex1);
1490 buffer.write(hex2);
1491 }
1492 }
1493 return buffer.toByteArray();
1494 }
1495
1496 private static char hexDigit(final int b) {
1497 return Character.toUpperCase(Character.forDigit(b & 0xF, 16));
1498 }
1499
1500 /**
1501 * Determines whether two URLs share the same origin according to the Same-Origin Policy.
1502 * Two URLs are considered to have the same origin if they have the same protocol (scheme),
1503 * host, and port.
1504 *
1505 * <p>The method handles default ports correctly by using the URL's default port when
1506 * the explicit port is -1 (indicating no port was specified).
1507 *
1508 * @param originUrl the first URL to compare (must not be null)
1509 * @param newUrl the second URL to compare (must not be null)
1510 * @return {@code true} if both URLs have the same host and effective port; {@code false} otherwise
1511 */
1512 public static boolean isSameOrigin(final URL originUrl, final URL newUrl) {
1513 if (!originUrl.getProtocol().equals(newUrl.getProtocol())) {
1514 return false;
1515 }
1516
1517 if (!originUrl.getHost().equalsIgnoreCase(newUrl.getHost())) {
1518 return false;
1519 }
1520
1521 int originPort = originUrl.getPort();
1522 if (originPort == -1) {
1523 originPort = originUrl.getDefaultPort();
1524 }
1525 int newPort = newUrl.getPort();
1526 if (newPort == -1) {
1527 newPort = newUrl.getDefaultPort();
1528 }
1529 return originPort == newPort;
1530 }
1531 }