X-Git-Url: http://git.nikiroo.be/?a=blobdiff_plain;f=src%2Fjexer%2Fbits%2FStringUtils.java;h=d33f71f4e0031710e52ed008fefa0f37f28883c4;hb=c4cefaa04ec122fc02efb6542451a31fdf722c32;hp=a98756ed22950a5f3eb359f36f24438448078d0e;hpb=656c0dddc7c0faddd62d373f22916107d322429e;p=fanfix.git diff --git a/src/jexer/bits/StringUtils.java b/src/jexer/bits/StringUtils.java index a98756e..d33f71f 100644 --- a/src/jexer/bits/StringUtils.java +++ b/src/jexer/bits/StringUtils.java @@ -30,6 +30,7 @@ package jexer.bits; import java.util.List; import java.util.ArrayList; +import java.util.Arrays; /** * StringUtils contains methods to: @@ -41,6 +42,11 @@ import java.util.ArrayList; * * - Read/write a line of RFC4180 comma-separated values strings to/from a * list of strings. + * + * - Compute number of visible text cells for a given Unicode codepoint or + * string. + * + * - Convert bytes to and from base-64 encoding. */ public class StringUtils { @@ -80,14 +86,14 @@ public class StringUtils { // We have just transitioned from a word to // whitespace. See if we have enough space to add // the word to the line. - if (word.length() + line.length() > n) { + if (width(word.toString()) + width(line.toString()) > n) { // This word will exceed the line length. Wrap // at it instead. result.add(line.toString()); line = new StringBuilder(); } if ((word.toString().startsWith(" ")) - && (line.length() == 0) + && (width(line.toString()) == 0) ) { line.append(word.substring(1)); } else { @@ -112,14 +118,14 @@ public class StringUtils { } } // for (int j = 0; j < rawLines[i].length(); j++) - if (word.length() + line.length() > n) { + if (width(word.toString()) + width(line.toString()) > n) { // This word will exceed the line length. Wrap at it // instead. 
result.add(line.toString()); line = new StringBuilder(); } if ((word.toString().startsWith(" ")) - && (line.length() == 0) + && (width(line.toString()) == 0) ) { line.append(word.substring(1)); } else { @@ -148,7 +154,7 @@ public class StringUtils { List lines = left(str, n); for (String line: lines) { StringBuilder sb = new StringBuilder(); - for (int i = 0; i < n - line.length(); i++) { + for (int i = 0; i < n - width(line); i++) { sb.append(' '); } sb.append(line); @@ -175,8 +181,8 @@ public class StringUtils { List lines = left(str, n); for (String line: lines) { StringBuilder sb = new StringBuilder(); - int l = (n - line.length()) / 2; - int r = n - line.length() - l; + int l = (n - width(line)) / 2; + int r = n - width(line) - l; for (int i = 0; i < l; i++) { sb.append(' '); } @@ -404,4 +410,336 @@ public class StringUtils { return result.toString(); } + /** + * Determine display width of a Unicode code point. Values below 0x20 + * and values from 0x7f through 0x9f report 0, code points inside the + * wide ranges tested in the body (Hangul, CJK, fullwidth forms, emoji) + * report 2, and everything else reports 1. + * + * @param ch the code point, can be char + * @return the number of text cell columns required to display this code + * point, one of 0, 1, or 2 + */ + public static int width(final int ch) { + /* + * This routine is a modified version of mk_wcwidth() available + * at: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c + * + * The combining characters list has been omitted from this + * implementation. Hopefully no users will be impacted. + */ + + // 8-bit control characters: width 0 + if (ch == 0) { + return 0; + } + if ((ch < 32) || ((ch >= 0x7f) && (ch < 0xa0))) { + return 0; + } + + // All others: either 1 or 2 + if ((ch >= 0x1100) + && ((ch <= 0x115f) + // Hangul Jamo init. consonants + || (ch == 0x2329) + || (ch == 0x232a) + // CJK ... 
Yi + || ((ch >= 0x2e80) && (ch <= 0xa4cf) && (ch != 0x303f)) + // Hangul Syllables + || ((ch >= 0xac00) && (ch <= 0xd7a3)) + // CJK Compatibility Ideographs + || ((ch >= 0xf900) && (ch <= 0xfaff)) + // Vertical forms + || ((ch >= 0xfe10) && (ch <= 0xfe19)) + // CJK Compatibility Forms + || ((ch >= 0xfe30) && (ch <= 0xfe6f)) + // Fullwidth Forms + || ((ch >= 0xff00) && (ch <= 0xff60)) + || ((ch >= 0xffe0) && (ch <= 0xffe6)) + || ((ch >= 0x20000) && (ch <= 0x2fffd)) + || ((ch >= 0x30000) && (ch <= 0x3fffd)) + // emoji + || ((ch >= 0x1f004) && (ch <= 0x1fffd)) + ) + ) { + return 2; + } + return 1; + } + + /** + * Determine display width of a string. This ASSUMES that no characters + * are combining. Hopefully no users will be impacted. The string is + * walked one code point at a time, so a surrogate pair is measured + * once, and the per-code-point widths are summed. + * + * @param str the string, null is treated as having width 0 + * @return the number of text cell columns required to display this string + */ + public static int width(final String str) { + if (str == null) { + return 0; + } + + int n = 0; + for (int i = 0; i < str.length();) { + int ch = str.codePointAt(i); + n += width(ch); + i += Character.charCount(ch); + } + return n; + } + + /** + * Check if character is in the CJK range (0x2e80 through 0x9fff, + * inclusive). + * + * @param ch character to check + * @return true if this character is in the CJK range + */ + public static boolean isCjk(final int ch) { + return ((ch >= 0x2e80) && (ch <= 0x9fff)); + } + + /** + * Check if character is in the emoji range (0x1f004 through 0x1fffd, + * inclusive). + * + * @param ch character to check + * @return true if this character is in the emoji range + */ + public static boolean isEmoji(final int ch) { + return ((ch >= 0x1f004) && (ch <= 0x1fffd)); + } + + // ------------------------------------------------------------------------ + // Base64 ----------------------------------------------------------------- + // ------------------------------------------------------------------------ + + /* + * The Base64 encoder/decoder below is provided to support JDK 1.6 - JDK + * 11. 
It was taken from https://sourceforge.net/projects/migbase64/ + * + * The following changes were made: + * + * - Code has been indented and long lines cut to fit within 80 columns. + * + * - Char, String, and "fast" byte functions removed. byte versions + * retained and called toBase64()/fromBase64(). + * + * - Enclosing braces added to blocks. + */ + + /** + * A very fast and memory efficient class to encode and decode to and + * from BASE64 in full accordance with RFC 2045.

On Windows XP + * sp1 with 1.4.2_04 and later ;), this encoder and decoder is about 10 + * times faster on small arrays (10 - 1000 bytes) and 2-3 times as fast + * on larger arrays (10000 - 1000000 bytes) compared to + * sun.misc.Encoder()/Decoder().

+ * + * On byte arrays the encoder is about 20% faster than Jakarta Commons + * Base64 Codec for encode and about 50% faster for decoding large + * arrays. This implementation is about twice as fast on very small + * arrays (< 30 bytes). If source/destination is a String + * this version is about three times as fast due to the fact that the + * Commons Codec result has to be recoded to a String from + * byte[], which is very expensive.

+ * + * This encode/decode algorithm doesn't create any temporary arrays as + * many other codecs do, it only allocates the resulting array. This + * produces less garbage and it is possible to handle arrays twice as + * large as algorithms that create a temporary array. (E.g. Jakarta + * Commons Codec). It is unknown whether Sun's + * sun.misc.Encoder()/Decoder() produce temporary arrays but + * since performance is quite low it probably does.

+ * + * The encoder produces the same output as the Sun one except that the + * Sun's encoder appends a trailing line separator if the last character + * isn't a pad. Unclear why but it only adds to the length and is + * probably a side effect. Both are in conformance with RFC 2045 + * though.
Commons Codec seems to always add a trailing line + * separator.

+ * + * Note! The encode/decode method pairs (types) come in three + * versions with the exact same algorithm and thus a lot of code + * redundancy. This is to not create any temporary arrays for transcoding + * to/from different format types. The methods not used can simply be + * commented out.

+ * + * There is also a "fast" version of all decode methods that works the + * same way as the normal ones, but has a few demands on the decoded + * input. Normally though, these fast versions should be used if the + * source of the input is known and it hasn't been tampered with.

+ * + * If you find the code useful or you find a bug, please send me a note + * at base64 @ miginfocom . com. + * + * Licence (BSD): + * ============== + * + * Copyright (c) 2004, Mikael Grev, MiG InfoCom AB. (base64 @ miginfocom + * . com) All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * Neither the name of the MiG InfoCom AB nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * @version 2.2 + * @author Mikael Grev + * Date: 2004-aug-02 + * Time: 11:31:11 + */ + + private static final char[] CA = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/".toCharArray(); + private static final int[] IA = new int[256]; + static { + Arrays.fill(IA, -1); + for (int i = 0, iS = CA.length; i < iS; i++) { + IA[CA[i]] = i; + } + IA['='] = 0; + } + + /** + * Encodes a raw byte array into a BASE64 String representation in + * accordance with RFC 2045. A CR LF line separator is inserted after + * every 76 output characters, except at the very end of the output. + * @param sArr The bytes to convert. If null or length 0 + * an empty string will be returned. + * @return A BASE64 encoded string. Never null. + */ + public final static String toBase64(byte[] sArr) { + // Check special case + int sLen = sArr != null ? sArr.length : 0; + if (sLen == 0) { + return ""; + } + + final boolean lineSep = true; + + int eLen = (sLen / 3) * 3; // Length of even 24-bits. + int cCnt = ((sLen - 1) / 3 + 1) << 2; // Returned character count + int dLen = cCnt + (lineSep ? (cCnt - 1) / 76 << 1 : 0); // Length of returned array + byte[] dArr = new byte[dLen]; + + // Encode even 24-bits + for (int s = 0, d = 0, cc = 0; s < eLen;) { + // Copy next three bytes into lower 24 bits of int, paying + // attention to sign. + int i = (sArr[s++] & 0xff) << 16 | (sArr[s++] & 0xff) << 8 | (sArr[s++] & 0xff); + + // Encode the int into four chars + dArr[d++] = (byte) CA[(i >>> 18) & 0x3f]; + dArr[d++] = (byte) CA[(i >>> 12) & 0x3f]; + dArr[d++] = (byte) CA[(i >>> 6) & 0x3f]; + dArr[d++] = (byte) CA[i & 0x3f]; + + // Add optional line separator + if (lineSep && ++cc == 19 && d < dLen - 2) { + dArr[d++] = '\r'; + dArr[d++] = '\n'; + cc = 0; + } + } + + // Pad and encode last bits if source isn't an even 24 bits. + int left = sLen - eLen; // 0 - 2. + if (left > 0) { + // Prepare the int + int i = ((sArr[eLen] & 0xff) << 10) | (left == 2 ? 
((sArr[sLen - 1] & 0xff) << 2) : 0); + + // Set last four chars + dArr[dLen - 4] = (byte) CA[i >> 12]; + dArr[dLen - 3] = (byte) CA[(i >>> 6) & 0x3f]; + dArr[dLen - 2] = left == 2 ? (byte) CA[i & 0x3f] : (byte) '='; + dArr[dLen - 1] = '='; + } + try { + return new String(dArr, "UTF-8"); + } catch (java.io.UnsupportedEncodingException e) { + throw new IllegalArgumentException(e); + } + + } + + /** + * Decodes a BASE64 encoded byte array. All illegal characters will + * be ignored and can handle both arrays with and without line + * separators. + * + * NOTE(review): the decoded length is computed as the legal character + * count times 6/8 minus the number of trailing '=' found, so input + * that ends in more '=' than that (for example eight '=' in a row) + * yields a negative length and the array allocation throws + * NegativeArraySizeException. + * @param sArr The source array. Length 0 will return an empty + * array. null will throw a NullPointerException. + * @return The decoded array of bytes. May be of length 0. Will be + * null if the number of legal characters (including '=') isn't + * divisible by 4. (I.e. definitely corrupted). + */ + public final static byte[] fromBase64(byte[] sArr) { + // Check special case + int sLen = sArr.length; + + // Count illegal characters (including '\r', '\n') to know what + // size the returned array will be, so we don't have to + // reallocate & copy it later. + int sepCnt = 0; // Number of separator characters. (Actually illegal characters, but that's a bonus...) + for (int i = 0; i < sLen; i++) { + // If input is "pure" (I.e. no line separators or illegal chars) + // base64 this loop can be commented out. + if (IA[sArr[i] & 0xff] < 0) { + sepCnt++; + } + } + + // Check so that legal chars (including '=') are evenly + // divisible by 4 as specified in RFC 2045. + if ((sLen - sepCnt) % 4 != 0) { + return null; + } + + int pad = 0; + for (int i = sLen; i > 1 && IA[sArr[--i] & 0xff] <= 0;) { + if (sArr[i] == '=') { + pad++; + } + } + + int len = ((sLen - sepCnt) * 6 >> 3) - pad; + + byte[] dArr = new byte[len]; // Preallocate byte[] of exact length + + for (int s = 0, d = 0; d < len;) { + // Assemble three bytes into an int from four "valid" characters. + int i = 0; + for (int j = 0; j < 4; j++) { // j only increased if a valid char was found. 
+ int c = IA[sArr[s++] & 0xff]; + if (c >= 0) { + i |= c << (18 - j * 6); + } else { + j--; + } + } + + // Add the bytes + dArr[d++] = (byte) (i >> 16); + if (d < len) { + dArr[d++]= (byte) (i >> 8); + if (d < len) { + dArr[d++] = (byte) i; + } + } + } + + return dArr; + } + }