internal/runtime/URIUtils.java

/*
 * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package jdk.nashorn.internal.runtime;

import static jdk.nashorn.internal.runtime.ECMAErrors.uriError;

/**
 * URI handling global functions. ECMA 15.1.3 URI Handling Function Properties
 *
 */
public final class URIUtils {

    private URIUtils() {
    }

    static String encodeURI(final Object self, final String string) {
        return encode(self, string, false);
    }

    static String encodeURIComponent(final Object self, final String string) {
        return encode(self, string, true);
    }

    static String decodeURI(final Object self, final String string) {
        return decode(self, string, false);
    }

    static String decodeURIComponent(final Object self, final String string) {
        return decode(self, string, true);
    }

    // abstract encode function
    private static String encode(final Object self, final String string, final boolean component) {
        if (string.isEmpty()) {
            return string;
        }

        final int len = string.length();
        final StringBuilder sb = new StringBuilder();

        for (int k = 0; k < len; k++) {
            final char C = string.charAt(k);
            if (isUnescaped(C, component)) {
                sb.append(C);
                continue;
            }

            if (C >= 0xDC00 && C <= 0xDFFF) {
                return error(string, k);
            }

            int V;
            if (C < 0xD800 || C > 0xDBFF) {
                V = C;
            } else {
                k++;
                if (k == len) {
                    return error(string, k);
                }

                final char kChar = string.charAt(k);
                if (kChar < 0xDC00 || kChar > 0xDFFF) {
                    return error(string, k);
                }
                V = ((C - 0xD800) * 0x400 + (kChar - 0xDC00) + 0x10000);
            }

            try {
                sb.append(toHexEscape(V));
            } catch (final Exception e) {
                throw uriError(e, "bad.uri", string, Integer.toString(k));
            }
        }

        return sb.toString();
    }

    // abstract decode function
    private static String decode(final Object self, final String string, final boolean component) {
        if (string.isEmpty()) {
            return string;
        }

        final int           len = string.length();
        final StringBuilder sb  = new StringBuilder();

        for (int k = 0; k < len; k++) {
            final char ch = string.charAt(k);
            if (ch != '%') {
                sb.append(ch);
                continue;
            }
            final int start = k;
            if (k + 2 >= len) {
                return error(string, k);
            }

            int B = toHexByte(string.charAt(k + 1), string.charAt(k + 2));
            if (B < 0) {
                return error(string, k + 1);
            }

            k += 2;
            char C;
            // Most significant bit is zero
            if ((B & 0x80) == 0) {
                C = (char) B;
                if (!component && URI_RESERVED.indexOf(C) >= 0) {
                    for (int j = start; j <= k; j++) {
                        sb.append(string.charAt(j));
                    }
                } else {
                    sb.append(C);
                }
            } else {
                // n is utf8 length, V is codepoint and minV is lower bound
                int n, V, minV;

                if ((B & 0xC0) == 0x80) {
                    // 10xxxxxx - illegal first byte
                    return error(string, k);
                } else if ((B & 0x20) == 0) {
                    // 110xxxxx 10xxxxxx
                    n = 2;
                    V = B & 0x1F;
                    minV = 0x80;
                } else if ((B & 0x10) == 0) {
                    // 1110xxxx 10xxxxxx 10xxxxxx
                    n = 3;
                    V = B & 0x0F;
                    minV = 0x800;
                } else if ((B & 0x08) == 0) {
                    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                    n = 4;
                    V = B & 0x07;
                    minV = 0x10000;
                } else if ((B & 0x04) == 0) {
                    // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                    n = 5;
                    V =  B & 0x03;
                    minV = 0x200000;
                } else if ((B & 0x02) == 0) {
                    // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                    n = 6;
                    V = B & 0x01;
                    minV = 0x4000000;
                } else {
                    return error(string, k);
                }

                // check bound for sufficient chars
                if (k + (3*(n-1)) >= len) {
                    return error(string, k);
                }

                for (int j = 1; j < n; j++) {
                    k++;
                    if (string.charAt(k) != '%') {
                        return error(string, k);
                    }

                    B = toHexByte(string.charAt(k + 1), string.charAt(k + 2));
                    if (B < 0 || (B & 0xC0) != 0x80) {
                        return error(string, k + 1);
                    }

                    V = (V << 6) | (B & 0x3F);
                    k += 2;
                }

                // Check for overlongs and invalid codepoints.
                // The high and low surrogate halves used by UTF-16
                // (U+D800 through U+DFFF) are not legal Unicode values.
                if ((V < minV) || (V >= 0xD800 && V <= 0xDFFF)) {
                    V = Integer.MAX_VALUE;
                }

                if (V < 0x10000) {
                    C = (char) V;
                    if (!component && URI_RESERVED.indexOf(C) >= 0) {
                        for (int j = start; j != k; j++) {
                            sb.append(string.charAt(j));
                        }
                    } else {
                        sb.append(C);
                    }
                } else { // V >= 0x10000
                    if (V > 0x10FFFF) {
                        return error(string, k);
                    }
                    final int L = ((V - 0x10000) & 0x3FF) + 0xDC00;
                    final int H = (((V - 0x10000) >> 10) & 0x3FF) + 0xD800;
                    sb.append((char) H);
                    sb.append((char) L);
                }
            }
        }

        return sb.toString();
    }

    private static int hexDigit(final char ch) {
        final char chu = Character.toUpperCase(ch);
        if (chu >= '0' && chu <= '9') {
            return (chu - '0');
        } else if (chu >= 'A' && chu <= 'F') {
            return (chu - 'A' + 10);
        } else {
            return -1;
        }
    }

    private static int toHexByte(final char ch1, final char ch2) {
        final int i1 = hexDigit(ch1);
        final int i2 = hexDigit(ch2);
        if (i1 >= 0 && i2 >= 0) {
            return (i1 << 4) | i2;
        }
        return -1;
    }

    private static String toHexEscape(final int u0) {
        int u = u0;
        int len;
        final byte[] b = new byte[6];

        if (u <= 0x7f) {
            b[0] = (byte) u;
            len = 1;
        } else {
            // > 0x7ff -> length 2
            // > 0xffff -> length 3
            // and so on. each new length is an additional 5 bits from the
            // original 11
            // the final mask is 8-len zeros in the low part.
            len = 2;
            for (int mask = u >>> 11; mask != 0; mask >>>= 5) {
                len++;
            }
            for (int i = len - 1; i > 0; i--) {
                b[i] = (byte) (0x80 | (u & 0x3f));
                u >>>= 6; // 64 bits per octet.
            }

            b[0] = (byte) (~((1 << (8 - len)) - 1) | u);
        }

        final StringBuilder sb = new StringBuilder();
        for (int i = 0; i < len; i++) {
            sb.append('%');
            if ((b[i] & 0xff) < 0x10) {
                sb.append('0');
            }
            sb.append(Integer.toHexString(b[i] & 0xff).toUpperCase());
        }

        return sb.toString();
    }

    private static String error(final String string, final int index) {
        throw uriError("bad.uri", string, Integer.toString(index));
    }

    // 'uriEscaped' except for alphanumeric chars
    private static final String URI_UNESCAPED_NONALPHANUMERIC = "-_.!~*'()";
    // 'uriReserved' + '#'
    private static final String URI_RESERVED = ";/?:@&=+$,#";

    private static boolean isUnescaped(final char ch, final boolean component) {
        if (('A' <= ch && ch <= 'Z') || ('a' <= ch && ch <= 'z')
                || ('0' <= ch && ch <= '9')) {
            return true;
        }

        if (URI_UNESCAPED_NONALPHANUMERIC.indexOf(ch) >= 0) {
            return true;
        }

        if (!component) {
            return URI_RESERVED.indexOf(ch) >= 0;
        }

        return false;
    }
}