*
* The MIT License (MIT)
*
- * Copyright (C) 2017 Kevin Lamonte
+ * Copyright (C) 2019 Kevin Lamonte
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
package jexer.bits;
import java.util.List;
-import java.util.LinkedList;
+import java.util.ArrayList;
+import java.util.Arrays;
/**
* StringUtils contains methods to:
*
* - Unescape C0 control codes.
*
+ * - Read/write a line of RFC4180 comma-separated values strings to/from a
+ * list of strings.
+ *
+ * - Compute number of visible text cells for a given Unicode codepoint or
+ * string.
+ *
+ * - Convert bytes to and from base-64 encoding.
*/
-public final class StringUtils {
+public class StringUtils {
/**
* Left-justify a string into a list of lines.
* @return the list of lines
*/
public static List<String> left(final String str, final int n) {
- List<String> result = new LinkedList<String>();
+ List<String> result = new ArrayList<String>();
/*
* General procedure:
// We have just transitioned from a word to
// whitespace. See if we have enough space to add
// the word to the line.
- if (word.length() + line.length() > n) {
+ if (width(word.toString()) + width(line.toString()) > n) {
// This word will exceed the line length. Wrap
// at it instead.
result.add(line.toString());
line = new StringBuilder();
}
if ((word.toString().startsWith(" "))
- && (line.length() == 0)
+ && (width(line.toString()) == 0)
) {
line.append(word.substring(1));
} else {
}
} // for (int j = 0; j < rawLines[i].length(); j++)
- if (word.length() + line.length() > n) {
+ if (width(word.toString()) + width(line.toString()) > n) {
// This word will exceed the line length. Wrap at it
// instead.
result.add(line.toString());
line = new StringBuilder();
}
if ((word.toString().startsWith(" "))
- && (line.length() == 0)
+ && (width(line.toString()) == 0)
) {
line.append(word.substring(1));
} else {
* @return the list of lines
*/
public static List<String> right(final String str, final int n) {
- List<String> result = new LinkedList<String>();
+ List<String> result = new ArrayList<String>();
/*
* Same as left(), but preceed each line with spaces to make it n
List<String> lines = left(str, n);
for (String line: lines) {
StringBuilder sb = new StringBuilder();
- for (int i = 0; i < n - line.length(); i++) {
+ for (int i = 0; i < n - width(line); i++) {
sb.append(' ');
}
sb.append(line);
* @return the list of lines
*/
public static List<String> center(final String str, final int n) {
- List<String> result = new LinkedList<String>();
+ List<String> result = new ArrayList<String>();
/*
* Same as left(), but preceed/succeed each line with spaces to make
List<String> lines = left(str, n);
for (String line: lines) {
StringBuilder sb = new StringBuilder();
- int l = (n - line.length()) / 2;
- int r = n - line.length() - l;
+ int l = (n - width(line)) / 2;
+ int r = n - width(line) - l;
for (int i = 0; i < l; i++) {
sb.append(' ');
}
* @return the list of lines
*/
public static List<String> full(final String str, final int n) {
- List<String> result = new LinkedList<String>();
+ List<String> result = new ArrayList<String>();
/*
* Same as left(), but insert spaces between words to make each line
return sb.toString();
}
+ /**
+ * Read a line of RFC4180 comma-separated values (CSV) into a list of
+ * strings.
+ *
+ * @param line the CSV line, with or without without line terminators
+ * @return the list of strings
+ */
+ public static List<String> fromCsv(final String line) {
+ List<String> result = new ArrayList<String>();
+
+ StringBuilder str = new StringBuilder();
+ boolean quoted = false;
+ boolean fieldQuoted = false;
+
+ for (int i = 0; i < line.length(); i++) {
+ char ch = line.charAt(i);
+
+ /*
+ System.err.println("ch '" + ch + "' str '" + str + "' " +
+ " fieldQuoted " + fieldQuoted + " quoted " + quoted);
+ */
+
+ if (ch == ',') {
+ if (fieldQuoted && quoted) {
+ // Terminating a quoted field.
+ result.add(str.toString());
+ str = new StringBuilder();
+ quoted = false;
+ fieldQuoted = false;
+ } else if (fieldQuoted) {
+ // Still waiting to see the terminating quote for this
+ // field.
+ str.append(ch);
+ } else if (quoted) {
+ // An unmatched double-quote and comma. This should be
+ // an invalid sequence. We will treat it as a quote
+ // terminating the field.
+ str.append('\"');
+ result.add(str.toString());
+ str = new StringBuilder();
+ quoted = false;
+ fieldQuoted = false;
+ } else {
+ // A field separator.
+ result.add(str.toString());
+ str = new StringBuilder();
+ quoted = false;
+ fieldQuoted = false;
+ }
+ continue;
+ }
+
+ if (ch == '\"') {
+ if ((str.length() == 0) && (!fieldQuoted)) {
+ // The opening quote to a quoted field.
+ fieldQuoted = true;
+ } else if (quoted) {
+ // This is a double-quote.
+ str.append('\"');
+ quoted = false;
+ } else {
+ // This is the beginning of a quote.
+ quoted = true;
+ }
+ continue;
+ }
+
+ // Normal character, pass it on.
+ str.append(ch);
+ }
+
+ // Include the final field.
+ result.add(str.toString());
+
+ return result;
+ }
+
+ /**
+ * Write a list of strings to on line of RFC4180 comma-separated values
+ * (CSV).
+ *
+ * @param list the list of strings
+ * @return the CSV line, without any line terminators
+ */
+ public static String toCsv(final List<String> list) {
+ StringBuilder result = new StringBuilder();
+ int i = 0;
+ for (String str: list) {
+
+ if (!str.contains("\"") && !str.contains(",")) {
+ // Just append the string with a comma.
+ result.append(str);
+ } else if (!str.contains("\"") && str.contains(",")) {
+ // Contains commas, but no quotes. Just double-quote it.
+ result.append("\"");
+ result.append(str);
+ result.append("\"");
+ } else if (str.contains("\"")) {
+ // Contains quotes and maybe commas. Double-quote it and
+ // replace quotes inside.
+ result.append("\"");
+ for (int j = 0; j < str.length(); j++) {
+ char ch = str.charAt(j);
+ result.append(ch);
+ if (ch == '\"') {
+ result.append("\"");
+ }
+ }
+ result.append("\"");
+ }
+
+ if (i < list.size() - 1) {
+ result.append(",");
+ }
+ i++;
+ }
+ return result.toString();
+ }
+
+ /**
+ * Determine display width of a Unicode code point.
+ *
+ * @param ch the code point, can be char
+ * @return the number of text cell columns required to display this code
+ * point, one of 0, 1, or 2
+ */
+ public static int width(final int ch) {
+ /*
+ * This routine is a modified version of mk_wcwidth() available
+ * at: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
+ *
+ * The combining characters list has been omitted from this
+ * implementation. Hopefully no users will be impacted.
+ */
+
+ // 8-bit control characters: width 0
+ if (ch == 0) {
+ return 0;
+ }
+ if ((ch < 32) || ((ch >= 0x7f) && (ch < 0xa0))) {
+ return 0;
+ }
+
+ // All others: either 1 or 2
+ if ((ch >= 0x1100)
+ && ((ch <= 0x115f)
+ // Hangul Jamo init. consonants
+ || (ch == 0x2329)
+ || (ch == 0x232a)
+ // CJK ... Yi
+ || ((ch >= 0x2e80) && (ch <= 0xa4cf) && (ch != 0x303f))
+ // Hangul Syllables
+ || ((ch >= 0xac00) && (ch <= 0xd7a3))
+ // CJK Compatibility Ideographs
+ || ((ch >= 0xf900) && (ch <= 0xfaff))
+ // Vertical forms
+ || ((ch >= 0xfe10) && (ch <= 0xfe19))
+ // CJK Compatibility Forms
+ || ((ch >= 0xfe30) && (ch <= 0xfe6f))
+ // Fullwidth Forms
+ || ((ch >= 0xff00) && (ch <= 0xff60))
+ || ((ch >= 0xffe0) && (ch <= 0xffe6))
+ || ((ch >= 0x20000) && (ch <= 0x2fffd))
+ || ((ch >= 0x30000) && (ch <= 0x3fffd))
+ // emoji
+ || ((ch >= 0x1f004) && (ch <= 0x1fffd))
+ )
+ ) {
+ return 2;
+ }
+ return 1;
+ }
+
+ /**
+ * Determine display width of a string. This ASSUMES that no characters
+ * are combining. Hopefully no users will be impacted.
+ *
+ * @param str the string
+ * @return the number of text cell columns required to display this string
+ */
+ public static int width(final String str) {
+ int n = 0;
+ for (int i = 0; i < str.length();) {
+ int ch = str.codePointAt(i);
+ n += width(ch);
+ i += Character.charCount(ch);
+ }
+ return n;
+ }
+
+ /**
+ * Check if character is in the CJK range.
+ *
+ * @param ch character to check
+ * @return true if this character is in the CJK range
+ */
+ public static boolean isCjk(final int ch) {
+ return ((ch >= 0x2e80) && (ch <= 0x9fff));
+ }
+
+ /**
+ * Check if character is in the emoji range.
+ *
+ * @param ch character to check
+ * @return true if this character is in the emoji range
+ */
+ public static boolean isEmoji(final int ch) {
+ return ((ch >= 0x1f004) && (ch <= 0x1fffd));
+ }
+
+ // ------------------------------------------------------------------------
+ // Base64 -----------------------------------------------------------------
+ // ------------------------------------------------------------------------
+
+ /*
+ * The Base64 encoder/decoder below is provided to support JDK 1.6 - JDK
+ * 11. It was taken from https://sourceforge.net/projects/migbase64/
+ *
+ * The following changes were made:
+ *
+ * - Code has been indented and long lines cut to fit within 80 columns.
+ *
+ * - Char, String, and "fast" byte functions removed. byte versions
+ * retained and called toBase64()/fromBase64().
+ *
+ * - Enclosing braces added to blocks.
+ */
+
+ /**
+ * A very fast and memory efficient class to encode and decode to and
+ * from BASE64 in full accordance with RFC 2045.<br><br> On Windows XP
+ * sp1 with 1.4.2_04 and later ;), this encoder and decoder is about 10
+ * times faster on small arrays (10 - 1000 bytes) and 2-3 times as fast
+ * on larger arrays (10000 - 1000000 bytes) compared to
+ * <code>sun.misc.Encoder()/Decoder()</code>.<br><br>
+ *
+ * On byte arrays the encoder is about 20% faster than Jakarta Commons
+ * Base64 Codec for encode and about 50% faster for decoding large
+ * arrays. This implementation is about twice as fast on very small
+ * arrays (< 30 bytes). If source/destination is a <code>String</code>
+ * this version is about three times as fast due to the fact that the
+ * Commons Codec result has to be recoded to a <code>String</code> from
+ * <code>byte[]</code>, which is very expensive.<br><br>
+ *
+ * This encode/decode algorithm doesn't create any temporary arrays as
+ * many other codecs do, it only allocates the resulting array. This
+ * produces less garbage and it is possible to handle arrays twice as
+ * large as algorithms that create a temporary array. (E.g. Jakarta
+ * Commons Codec). It is unknown whether Sun's
+ * <code>sun.misc.Encoder()/Decoder()</code> produce temporary arrays but
+ * since performance is quite low it probably does.<br><br>
+ *
+ * The encoder produces the same output as the Sun one except that the
+ * Sun's encoder appends a trailing line separator if the last character
+ * isn't a pad. Unclear why but it only adds to the length and is
+ * probably a side effect. Both are in conformance with RFC 2045
+ * though.<br> Commons codec seem to always att a trailing line
+ * separator.<br><br>
+ *
+ * <b>Note!</b> The encode/decode method pairs (types) come in three
+ * versions with the <b>exact</b> same algorithm and thus a lot of code
+ * redundancy. This is to not create any temporary arrays for transcoding
+ * to/from different format types. The methods not used can simply be
+ * commented out.<br><br>
+ *
+ * There is also a "fast" version of all decode methods that works the
+ * same way as the normal ones, but har a few demands on the decoded
+ * input. Normally though, these fast verions should be used if the
+ * source if the input is known and it hasn't bee tampered with.<br><br>
+ *
+ * If you find the code useful or you find a bug, please send me a note
+ * at base64 @ miginfocom . com.
+ *
+ * Licence (BSD):
+ * ==============
+ *
+ * Copyright (c) 2004, Mikael Grev, MiG InfoCom AB. (base64 @ miginfocom
+ * . com) All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * Neither the name of the MiG InfoCom AB nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @version 2.2
+ * @author Mikael Grev
+ * Date: 2004-aug-02
+ * Time: 11:31:11
+ */
+
+ private static final char[] CA = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/".toCharArray();
+ private static final int[] IA = new int[256];
+ static {
+ Arrays.fill(IA, -1);
+ for (int i = 0, iS = CA.length; i < iS; i++) {
+ IA[CA[i]] = i;
+ }
+ IA['='] = 0;
+ }
+
+ /**
+ * Encodes a raw byte array into a BASE64 <code>byte[]</code>
+ * representation i accordance with RFC 2045.
+ * @param sArr The bytes to convert. If <code>null</code> or length 0
+ * an empty array will be returned.
+ * @return A BASE64 encoded array. Never <code>null</code>.
+ */
+ public final static String toBase64(byte[] sArr) {
+ // Check special case
+ int sLen = sArr != null ? sArr.length : 0;
+ if (sLen == 0) {
+ return "";
+ }
+
+ final boolean lineSep = true;
+
+ int eLen = (sLen / 3) * 3; // Length of even 24-bits.
+ int cCnt = ((sLen - 1) / 3 + 1) << 2; // Returned character count
+ int dLen = cCnt + (lineSep ? (cCnt - 1) / 76 << 1 : 0); // Length of returned array
+ byte[] dArr = new byte[dLen];
+
+ // Encode even 24-bits
+ for (int s = 0, d = 0, cc = 0; s < eLen;) {
+ // Copy next three bytes into lower 24 bits of int, paying
+ // attension to sign.
+ int i = (sArr[s++] & 0xff) << 16 | (sArr[s++] & 0xff) << 8 | (sArr[s++] & 0xff);
+
+ // Encode the int into four chars
+ dArr[d++] = (byte) CA[(i >>> 18) & 0x3f];
+ dArr[d++] = (byte) CA[(i >>> 12) & 0x3f];
+ dArr[d++] = (byte) CA[(i >>> 6) & 0x3f];
+ dArr[d++] = (byte) CA[i & 0x3f];
+
+ // Add optional line separator
+ if (lineSep && ++cc == 19 && d < dLen - 2) {
+ dArr[d++] = '\r';
+ dArr[d++] = '\n';
+ cc = 0;
+ }
+ }
+
+ // Pad and encode last bits if source isn't an even 24 bits.
+ int left = sLen - eLen; // 0 - 2.
+ if (left > 0) {
+ // Prepare the int
+ int i = ((sArr[eLen] & 0xff) << 10) | (left == 2 ? ((sArr[sLen - 1] & 0xff) << 2) : 0);
+
+ // Set last four chars
+ dArr[dLen - 4] = (byte) CA[i >> 12];
+ dArr[dLen - 3] = (byte) CA[(i >>> 6) & 0x3f];
+ dArr[dLen - 2] = left == 2 ? (byte) CA[i & 0x3f] : (byte) '=';
+ dArr[dLen - 1] = '=';
+ }
+ try {
+ return new String(dArr, "UTF-8");
+ } catch (java.io.UnsupportedEncodingException e) {
+ throw new IllegalArgumentException(e);
+ }
+
+ }
+
+ /**
+ * Decodes a BASE64 encoded byte array. All illegal characters will
+ * be ignored and can handle both arrays with and without line
+ * separators.
+ * @param sArr The source array. Length 0 will return an empty
+ * array. <code>null</code> will throw an exception.
+ * @return The decoded array of bytes. May be of length 0. Will be
+ * <code>null</code> if the legal characters (including '=') isn't
+ * divideable by 4. (I.e. definitely corrupted).
+ */
+ public final static byte[] fromBase64(byte[] sArr) {
+ // Check special case
+ int sLen = sArr.length;
+
+ // Count illegal characters (including '\r', '\n') to know what
+ // size the returned array will be, so we don't have to
+ // reallocate & copy it later.
+ int sepCnt = 0; // Number of separator characters. (Actually illegal characters, but that's a bonus...)
+ for (int i = 0; i < sLen; i++) {
+ // If input is "pure" (I.e. no line separators or illegal chars)
+ // base64 this loop can be commented out.
+ if (IA[sArr[i] & 0xff] < 0) {
+ sepCnt++;
+ }
+ }
+
+ // Check so that legal chars (including '=') are evenly
+ // divideable by 4 as specified in RFC 2045.
+ if ((sLen - sepCnt) % 4 != 0) {
+ return null;
+ }
+
+ int pad = 0;
+ for (int i = sLen; i > 1 && IA[sArr[--i] & 0xff] <= 0;) {
+ if (sArr[i] == '=') {
+ pad++;
+ }
+ }
+
+ int len = ((sLen - sepCnt) * 6 >> 3) - pad;
+
+ byte[] dArr = new byte[len]; // Preallocate byte[] of exact length
+
+ for (int s = 0, d = 0; d < len;) {
+ // Assemble three bytes into an int from four "valid" characters.
+ int i = 0;
+ for (int j = 0; j < 4; j++) { // j only increased if a valid char was found.
+ int c = IA[sArr[s++] & 0xff];
+ if (c >= 0) {
+ i |= c << (18 - j * 6);
+ } else {
+ j--;
+ }
+ }
+
+ // Add the bytes
+ dArr[d++] = (byte) (i >> 16);
+ if (d < len) {
+ dArr[d++]= (byte) (i >> 8);
+ if (d < len) {
+ dArr[d++] = (byte) i;
+ }
+ }
+ }
+
+ return dArr;
+ }
+
}