X-Git-Url: http://git.nikiroo.be/?a=blobdiff_plain;f=src%2Fjexer%2Fbits%2FStringUtils.java;h=d33f71f4e0031710e52ed008fefa0f37f28883c4;hb=c4cefaa04ec122fc02efb6542451a31fdf722c32;hp=a98756ed22950a5f3eb359f36f24438448078d0e;hpb=656c0dddc7c0faddd62d373f22916107d322429e;p=fanfix.git

diff --git a/src/jexer/bits/StringUtils.java b/src/jexer/bits/StringUtils.java
index a98756e..d33f71f 100644
--- a/src/jexer/bits/StringUtils.java
+++ b/src/jexer/bits/StringUtils.java
@@ -30,6 +30,7 @@ package jexer.bits;
 
 import java.util.List;
 import java.util.ArrayList;
+import java.util.Arrays;
 
 /**
  * StringUtils contains methods to:
@@ -41,6 +42,11 @@ import java.util.ArrayList;
  *
  *    - Read/write a line of RFC4180 comma-separated values strings to/from a
  *      list of strings.
+ *
+ *    - Compute number of visible text cells for a given Unicode codepoint or
+ *      string.
+ *
+ *    - Convert bytes to and from base-64 encoding.
  */
 public class StringUtils {
 
@@ -80,14 +86,14 @@ public class StringUtils {
                         // We have just transitioned from a word to
                         // whitespace.  See if we have enough space to add
                         // the word to the line.
-                        if (word.length() + line.length() > n) {
+                        if (width(word.toString()) + width(line.toString()) > n) {
                             // This word will exceed the line length.  Wrap
                             // at it instead.
                             result.add(line.toString());
                             line = new StringBuilder();
                         }
                         if ((word.toString().startsWith(" "))
-                            && (line.length() == 0)
+                            && (width(line.toString()) == 0)
                         ) {
                             line.append(word.substring(1));
                         } else {
@@ -112,14 +118,14 @@ public class StringUtils {
                 }
             } // for (int j = 0; j < rawLines[i].length(); j++)
 
-            if (word.length() + line.length() > n) {
+            if (width(word.toString()) + width(line.toString()) > n) {
                 // This word will exceed the line length.  Wrap at it
                 // instead.
                 result.add(line.toString());
                 line = new StringBuilder();
             }
             if ((word.toString().startsWith(" "))
-                && (line.length() == 0)
+                && (width(line.toString()) == 0)
             ) {
                 line.append(word.substring(1));
             } else {
@@ -148,7 +154,7 @@ public class StringUtils {
         List<String> lines = left(str, n);
         for (String line: lines) {
             StringBuilder sb = new StringBuilder();
-            for (int i = 0; i < n - line.length(); i++) {
+            for (int i = 0; i < n - width(line); i++) {
                 sb.append(' ');
             }
             sb.append(line);
@@ -175,8 +181,8 @@ public class StringUtils {
         List<String> lines = left(str, n);
         for (String line: lines) {
             StringBuilder sb = new StringBuilder();
-            int l = (n - line.length()) / 2;
-            int r = n - line.length() - l;
+            int l = (n - width(line)) / 2;
+            int r = n - width(line) - l;
             for (int i = 0; i < l; i++) {
                 sb.append(' ');
             }
@@ -404,4 +410,336 @@ public class StringUtils {
         return result.toString();
     }
 
+    /**
+     * Determine display width of a Unicode code point.
+     *
+     * @param ch the code point, can be char
+     * @return the number of text cell columns required to display this code
+     * point: 0 for control codes, 2 for the wide ranges tested below, else 1
+     */
+    public static int width(final int ch) {
+        /*
+         * This routine is a modified version of mk_wcwidth() available
+         * at: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
+         *
+         * The combining characters list has been omitted from this
+         * implementation, so combining marks do not get width 0 here.
+         */
+
+        // NUL, anything below 0x20 (incl. negatives), 0x7f - 0x9f: width 0
+        if (ch == 0) {
+            return 0;
+        }
+        if ((ch < 32) || ((ch >= 0x7f) && (ch < 0xa0))) {
+            return 0;
+        }
+
+        // All others: either 1 or 2
+        if ((ch >= 0x1100)
+            && ((ch <= 0x115f)
+                // 0x1100 - 0x115f above: Hangul Jamo init. consonants
+                || (ch == 0x2329)
+                || (ch == 0x232a)
+                // CJK ... Yi
+                || ((ch >= 0x2e80) && (ch <= 0xa4cf) && (ch != 0x303f))
+                // Hangul Syllables
+                || ((ch >= 0xac00) && (ch <= 0xd7a3))
+                // CJK Compatibility Ideographs
+                || ((ch >= 0xf900) && (ch <= 0xfaff))
+                // Vertical forms
+                || ((ch >= 0xfe10) && (ch <= 0xfe19))
+                // CJK Compatibility Forms
+                || ((ch >= 0xfe30) && (ch <= 0xfe6f))
+                // Fullwidth Forms
+                || ((ch >= 0xff00) && (ch <= 0xff60))
+                || ((ch >= 0xffe0) && (ch <= 0xffe6))
+                || ((ch >= 0x20000) && (ch <= 0x2fffd))
+                || ((ch >= 0x30000) && (ch <= 0x3fffd))
+                // emoji
+                || ((ch >= 0x1f004) && (ch <= 0x1fffd))
+            )
+        ) {
+            return 2;
+        }
+        return 1;
+    }
+
+    /**
+     * Determine display width of a string.  This ASSUMES that no characters
+     * are combining; each code point is measured with width(int) and summed.
+     *
+     * @param str the string, may be null (null yields 0)
+     * @return the number of text cell columns required to display this string
+     */
+    public static int width(final String str) {
+        if (str == null) {
+            return 0;
+        }
+
+        int n = 0;
+        for (int i = 0; i < str.length();) {
+            int ch = str.codePointAt(i);    // full code point, not a char
+            n += width(ch);
+            i += Character.charCount(ch);   // advance 1 or 2 chars
+        }
+        return n;
+    }
+
+    /**
+     * Check if character is in the CJK range.
+     *
+     * @param ch code point to check
+     * @return true if ch is within 0x2e80 - 0x9fff, inclusive
+     */
+    public static boolean isCjk(final int ch) {
+        return ((ch >= 0x2e80) && (ch <= 0x9fff));
+    }
+
+    /**
+     * Check if character is in the emoji range.
+     *
+     * @param ch code point to check
+     * @return true if ch is within 0x1f004 - 0x1fffd, inclusive
+     */
+    public static boolean isEmoji(final int ch) {
+        return ((ch >= 0x1f004) && (ch <= 0x1fffd));
+    }
+
+    // ------------------------------------------------------------------------
+    // Base64 -----------------------------------------------------------------
+    // ------------------------------------------------------------------------
+
+    /*
+     * The Base64 encoder/decoder below is provided to support JDK 1.6 - JDK
+     * 11.  It was taken from https://sourceforge.net/projects/migbase64/
+     *
+     * The following changes were made:
+     *
+     * - Code has been indented and long lines cut to fit within 80 columns.
+     *
+     * - Char, String, and "fast" byte functions removed.  byte versions
+     *   retained and called toBase64()/fromBase64().
+     *
+     * - Enclosing braces added to blocks.
+     */
+
+    /**
+     * A very fast and memory efficient class to encode and decode to and
+     * from BASE64 in full accordance with RFC 2045.<br><br> On Windows XP
+     * sp1 with 1.4.2_04 and later ;), this encoder and decoder is about 10
+     * times faster on small arrays (10 - 1000 bytes) and 2-3 times as fast
+     * on larger arrays (10000 - 1000000 bytes) compared to
+     * <code>sun.misc.Encoder()/Decoder()</code>.<br><br>
+     *
+     * On byte arrays the encoder is about 20% faster than Jakarta Commons
+     * Base64 Codec for encode and about 50% faster for decoding large
+     * arrays. This implementation is about twice as fast on very small
+     * arrays (&lt; 30 bytes). If source/destination is a <code>String</code>
+     * this version is about three times as fast due to the fact that the
+     * Commons Codec result has to be recoded to a <code>String</code> from
+     * <code>byte[]</code>, which is very expensive.<br><br>
+     *
+     * This encode/decode algorithm doesn't create any temporary arrays as
+     * many other codecs do, it only allocates the resulting array. This
+     * produces less garbage and it is possible to handle arrays twice as
+     * large as algorithms that create a temporary array. (E.g. Jakarta
+     * Commons Codec). It is unknown whether Sun's
+     * <code>sun.misc.Encoder()/Decoder()</code> produce temporary arrays but
+     * since performance is quite low it probably does.<br><br>
+     *
+     * The encoder produces the same output as the Sun one except that the
+     * Sun's encoder appends a trailing line separator if the last character
+     * isn't a pad. Unclear why but it only adds to the length and is
+     * probably a side effect. Both are in conformance with RFC 2045
+     * though.<br> Commons codec seems to always add a trailing line
+     * separator.<br><br>
+     *
+     * <b>Note!</b> The encode/decode method pairs (types) come in three
+     * versions with the <b>exact</b> same algorithm and thus a lot of code
+     * redundancy. This is to not create any temporary arrays for transcoding
+     * to/from different format types. The methods not used can simply be
+     * commented out.<br><br>
+     *
+     * There is also a "fast" version of all decode methods that works the
+     * same way as the normal ones, but has a few demands on the decoded
+     * input. Normally though, these fast versions should be used if the
+     * source of the input is known and it hasn't been tampered with.<br><br>
+     *
+     * If you find the code useful or you find a bug, please send me a note
+     * at base64 @ miginfocom . com.
+     *
+     * Licence (BSD):
+     * ==============
+     *
+     * Copyright (c) 2004, Mikael Grev, MiG InfoCom AB. (base64 @ miginfocom
+     * . com) All rights reserved.
+     *
+     * Redistribution and use in source and binary forms, with or without
+     * modification, are permitted provided that the following conditions are
+     * met: Redistributions of source code must retain the above copyright
+     * notice, this list of conditions and the following disclaimer.
+     * Redistributions in binary form must reproduce the above copyright
+     * notice, this list of conditions and the following disclaimer in the
+     * documentation and/or other materials provided with the distribution.
+     * Neither the name of the MiG InfoCom AB nor the names of its
+     * contributors may be used to endorse or promote products derived from
+     * this software without specific prior written permission.
+     *
+     * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+     * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+     * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+     * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
+     * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+     * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+     * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+     * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+     * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+     * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+     * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+     *
+     * @version 2.2
+     * @author Mikael Grev
+     *         Date: 2004-aug-02
+     *         Time: 11:31:11
+     */
+
+    private static final char[] CA = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/".toCharArray();
+    private static final int[] IA = new int[256]; // byte value -> index in CA
+    static {
+        Arrays.fill(IA, -1);  // -1 marks a byte that is not a base64 char
+        for (int i = 0, iS = CA.length; i < iS; i++) {
+            IA[CA[i]] = i;
+        }
+        IA['='] = 0;  // padding counts as legal and contributes zero bits
+    }
+
+    /**
+     * Encodes a raw byte array into a BASE64 <code>String</code>
+     * representation in accordance with RFC 2045.
+     * @param sArr The bytes to convert. If <code>null</code> or length 0
+     * an empty string will be returned.
+     * @return A BASE64 encoded string. Never <code>null</code>.
+     */
+    public final static String toBase64(byte[] sArr) {
+        // Check special case
+        int sLen = sArr != null ? sArr.length : 0;
+        if (sLen == 0) {
+            return "";
+        }
+
+        final boolean lineSep = true; // always insert CRLF line breaks
+
+        int eLen = (sLen / 3) * 3;                              // Length of even 24-bits.
+        int cCnt = ((sLen - 1) / 3 + 1) << 2;                   // Returned character count
+        int dLen = cCnt + (lineSep ? (cCnt - 1) / 76 << 1 : 0); // Length of returned array
+        byte[] dArr = new byte[dLen];
+
+        // Encode even 24-bits
+        for (int s = 0, d = 0, cc = 0; s < eLen;) {
+            // Copy next three bytes into lower 24 bits of int, paying
+            // attention to sign.
+            int i = (sArr[s++] & 0xff) << 16 | (sArr[s++] & 0xff) << 8 | (sArr[s++] & 0xff);
+
+            // Encode the int into four chars
+            dArr[d++] = (byte) CA[(i >>> 18) & 0x3f];
+            dArr[d++] = (byte) CA[(i >>> 12) & 0x3f];
+            dArr[d++] = (byte) CA[(i >>> 6) & 0x3f];
+            dArr[d++] = (byte) CA[i & 0x3f];
+
+            // Add line separator after 19 groups (76 chars), not at the end
+            if (lineSep && ++cc == 19 && d < dLen - 2) {
+                dArr[d++] = '\r';
+                dArr[d++] = '\n';
+                cc = 0;
+            }
+        }
+
+        // Pad and encode last bits if source isn't an even 24 bits.
+        int left = sLen - eLen; // 0 - 2.
+        if (left > 0) {
+            // Prepare the int
+            int i = ((sArr[eLen] & 0xff) << 10) | (left == 2 ? ((sArr[sLen - 1] & 0xff) << 2) : 0);
+
+            // Set last four chars
+            dArr[dLen - 4] = (byte) CA[i >> 12];
+            dArr[dLen - 3] = (byte) CA[(i >>> 6) & 0x3f];
+            dArr[dLen - 2] = left == 2 ? (byte) CA[i & 0x3f] : (byte) '=';
+            dArr[dLen - 1] = '=';
+        }
+        try {
+            return new String(dArr, "UTF-8");
+        } catch (java.io.UnsupportedEncodingException e) {
+            throw new IllegalArgumentException(e);
+        }
+
+    }
+
+    /**
+     * Decodes a BASE64 encoded byte array. All illegal characters will
+     * be ignored and can handle both arrays with and without line
+     * separators.
+     * @param sArr The source array. Length 0 will return an empty
+     * array. <code>null</code> will throw an exception.
+     * @return The decoded array of bytes. May be of length 0. Will be
+     * <code>null</code> if the count of legal characters (including '=')
+     * isn't divisible by 4. (I.e. definitely corrupted).
+     */
+    public final static byte[] fromBase64(byte[] sArr) {
+        // No null check: a null sArr throws NullPointerException here
+        int sLen = sArr.length;
+
+        // Count illegal characters (including '\r', '\n') to know what
+        // size the returned array will be, so we don't have to
+        // reallocate & copy it later.
+        int sepCnt = 0; // Number of separator characters. (Actually illegal characters, but that's a bonus...)
+        for (int i = 0; i < sLen; i++) {
+            // If input is "pure" (I.e. no line separators or illegal chars)
+            // base64 this loop can be commented out.
+            if (IA[sArr[i] & 0xff] < 0) {
+                sepCnt++;
+            }
+        }
+
+        // Check so that legal chars (including '=') are evenly
+        // divisible by 4 as specified in RFC 2045.
+        if ((sLen - sepCnt) % 4 != 0) {
+            return null;
+        }
+
+        int pad = 0; // '=' seen scanning back from the end while IA <= 0
+        for (int i = sLen; i > 1 && IA[sArr[--i] & 0xff] <= 0;) {
+            if (sArr[i] == '=') {
+                pad++;
+            }
+        }
+
+        int len = ((sLen - sepCnt) * 6 >> 3) - pad; // 6 bits per legal char
+
+        byte[] dArr = new byte[len];       // Preallocate byte[] of exact length
+
+        for (int s = 0, d = 0; d < len;) {
+            // Assemble three bytes into an int from four "valid" characters.
+            int i = 0;
+            for (int j = 0; j < 4; j++) {   // j only increased if a valid char was found.
+                int c = IA[sArr[s++] & 0xff];
+                if (c >= 0) {
+                    i |= c << (18 - j * 6);
+                } else {
+                    j--;
+                }
+            }
+
+            // Add the bytes
+            dArr[d++] = (byte) (i >> 16);
+            if (d < len) {
+                dArr[d++]= (byte) (i >> 8);
+                if (d < len) {
+                    dArr[d++] = (byte) i;
+                }
+            }
+        }
+
+        return dArr;
+    }
+
 }