001    /*
002    Copyright (c) 1996-2012, Damon Hart-Davis
003    All rights reserved.
004    
005    Redistribution and use in source and binary forms, with or without
006    modification, are permitted provided that the following conditions are
007    met:
008    
009      * Redistributions of source code must retain the above copyright
010        notice, this list of conditions and the following disclaimer.
011    
012      * Redistributions in binary form must reproduce the above copyright
013        notice, this list of conditions and the following disclaimer in the
014        documentation and/or other materials provided with the
015        distribution.
016    
017    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
018    IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
019    TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
020    PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
021    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
022    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
023    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
024    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
025    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
026    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
027    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
028    */
029    package org.hd.d.pg2k.svrCore;
030    
031    import java.util.Collection;
032    import java.util.Comparator;
033    import java.util.SortedSet;
034    import java.util.TreeSet;
035    import java.util.concurrent.atomic.AtomicReference;
036    import java.util.regex.Matcher;
037    import java.util.regex.Pattern;
038    
039    import org.w3c.dom.Element;
040    import org.w3c.dom.NamedNodeMap;
041    import org.w3c.dom.Node;
042    
043    /**Some simple common text utilities of scope throughout the application.
044     */
045    public final class TextUtils
046        {
047        /**Prevent construction of an instance. */
048        private TextUtils() { }
049    
050        /**Suffixes used by sizeAsText. */
051        private static final String sizeSuffixes[] = { "B", "kB", "MB", "GB", "TB", "EB" };
052        /**Renders size in bytes as text, abbreviated if requested.
053         * Can produce normal or abbreviated output.
054         */
055        public static final String sizeAsText(final long size, final boolean abbrev)
056            {
057            final StringBuilder sb = new StringBuilder(abbrev ? 5 : 16);
058            if(abbrev)
059                {
060                long divisor = 1;
061                for(int i = 0; i < sizeSuffixes.length; ++i, divisor *= 1024)
062                    {
063                    if(size < divisor * 1000)
064                        {
065                        // Takes 3 or less digits with this suffix, so show it!
066                        sb.append((size + divisor/2) / divisor);
067                        sb.append(sizeSuffixes[i]);
068                        return(sb.toString()); // Done.
069                        }
070                    }
071                // We may fall through to the default case...
072                }
073            sb.append(size);
074            sb.append("Byte");
075            return(sb.toString());
076            }
077    
078        /**Returns true if given String is 7-bit clean, is is pure ASCII, or is null.
079         */
080        public static boolean isASCII7(final String s)
081            {
082            if(s == null) { return(true); }
083            for(int i = s.length(); --i >= 0; )
084                { if((s.charAt(i) & ~0x7f) != 0) { return(false); } }
085            return(true); // Seems OK.
086            }
087    
088        /**Recursively copy second Node and contents into first node.
089         * This works even when the second Node is from a different Document to the first
090         * and importNode() does not work (eg due to JDK1.5 impl bug).
091         */
092        public static final void importCopy(final Node dest, final Node src)
093            {
094            if((dest == null) || (src == null))
095                { throw new IllegalArgumentException(); }
096    
097            final Element cp = dest.getOwnerDocument().createElement(src.getNodeName());
098    
099            // Deal with attrs, etc.
100            final boolean hasAttributes = (src.getNodeType() == Node.ELEMENT_NODE) &&
101                    (src.getAttributes().getLength() > 0);
102            if(hasAttributes)
103                {
104                final NamedNodeMap attribs = src.getAttributes();
105                for(int i = 0; i < attribs.getLength(); i++)
106                    { cp.setAttribute(attribs.item(i).getNodeName(), attribs.item(i).getNodeValue()); }
107                }
108    
109            // Copy in any node value.
110            cp.setNodeValue(src.getNodeValue());
111    
112            // Now add the new node to the destination tree.
113            dest.appendChild(cp);
114    
115            // Recursively copy children.
116            for(Node child = src.getFirstChild(); child != null; child = child.getNextSibling())
117                { importCopy(cp, child); }
118            }
119    
120        /**String to write as line terminator when converting to XML.
121         * A simple LF would suit UNIX-like systems
122         * though would probably be accepted by most.
123         * A bulkier CRLF is more like the Internet "standard".
124         */
125        private static final String XML_LINE_TERM = "\r\n";
126    
127        /**Write DOM tree as XML/XHTML String; never "" nor null.
128         * Can write as terse/efficient single-line form or longer more human-friendly format.
129         * <p>
130         * When writing XHTML we make text nodes XML/HTML-safe, though allow embedded entities.
131         *
132         * @param node  DOM tree root; never null
133         * @param toXHTML  if true write with formatting suitable to include directly in HTML/XHTML for human consumption
134         *     (eg using dl/ul/ol nested lists to produce formatted text representing the structure)
135         *     rather than a pure XML representation of the Node tree itself
136         * @param terse  if true write as compactly as possible, else make more human-readable if possible
137         */
138        public static final String toXML(final Node node, final boolean toXHTML, final boolean terse)
139            {
140            final StringBuilder result = new StringBuilder(64);
141            toXML(result, node, toXHTML, terse);
142            return(result.toString());
143            }
144    
145        /**Write DOM tree as XML/XHTML String; appends to supplied StringBuilder.
146         * Can write as terse/efficient single-line form or longer more human-friendly format.
147         * <p>
148         * When writing XHTML we make text nodes XML/HTML-safe, though allow embedded entities.
149         *
150         * @param node  DOM tree root; never null
151         * @param toXHTML  if true write with formatting suitable to include directly in HTML/XHTML for human consumption
152         *     (eg using dl/ul/ol nested lists to produce formatted text representing the structure)
153         *     rather than a pure XML representation of the Node tree itself
154         * @param terse  if true write as compactly as possible, else make more human-readable if possible
155         */
156        public static final void toXML(final StringBuilder result,
157                                       final Node node, final boolean toXHTML, final boolean terse)
158            {
159            if((node == null) || (result == null))
160                { throw new IllegalArgumentException(); }
161    
162            final String nodeValue = node.getNodeValue();
163    
164            final boolean hasValue = (node.getNodeValue() != null);
165    
166            final boolean hasChildren = (node.getFirstChild() != null);
167    
168            final boolean hasAttributes = (node.getNodeType() == Node.ELEMENT_NODE) &&
169                    (node.getAttributes().getLength() > 0);
170    
171            if(toXHTML) // XHTML/HTML formatted text.
172                {
173                result.append("<dl compact><dt>");
174                result.append("<b>").append(node.getNodeName()).append("</b>");
175    
176                if(hasAttributes)
177                    {
178                    final NamedNodeMap attribs = node.getAttributes();
179                    for(int i = 0; i < attribs.getLength(); i++)
180                        {
181                        result.append(' ');
182                        result.append(attribs.item(i).getNodeName());
183                        result.append("=\"");
184                        result.append(escapeHTMLMetaChars(attribs.item(i).getNodeValue()));
185                        result.append('"');
186                        }
187                    }
188    
189                // Write any value with < > & " ' escaped...
190                if(hasValue) { result.append(" [").append(escapeHTMLMetaChars(nodeValue.trim())).append("] "); }
191    
192                // Recursively deal with any child nodes...
193                if(hasChildren)
194                    {
195                    result.append("<dd>");
196                    for(Node child = node.getFirstChild(); child != null; child = child.getNextSibling())
197                        { toXML(result, child, toXHTML, terse); }
198                    }
199    
200                result.append("</dl>");
201                }
202            else // Pure XML.
203                {
204                result.append('<').append(node.getNodeName());
205    
206                if(hasAttributes)
207                    {
208                    final NamedNodeMap attribs = node.getAttributes();
209                    for(int i = 0; i < attribs.getLength(); i++)
210                        {
211                        result.append(' ');
212                        result.append(attribs.item(i).getNodeName());
213                        result.append("=\"");
214                        result.append(escapeHTMLMetaChars(attribs.item(i).getNodeValue()));
215                        result.append('"');
216                        }
217                    }
218    
219                // If we have no value or children
220                // then we can use the <.../> format with no separate closing tag.
221                final boolean unitag = (!hasValue && !hasChildren);
222    
223                if(unitag) { result.append('/'); }
224    
225                result.append('>');
226    
227                if(!unitag)
228                    {
229                    if(hasValue) { result.append(escapeHTMLMetaChars(nodeValue.trim())); }
230    
231                    for(Node child = node.getFirstChild(); child != null; child = child.getNextSibling())
232                        { result.append(toXML(child, toXHTML, terse)); }
233    
234                    result.append("</").append(node.getNodeName()).append('>');
235                    }
236                }
237    
238            if(!terse) { result.append(XML_LINE_TERM); } // End the line for readability...
239            }
240    
241        /**Rewrite HTML so that it displays as "raw" text and is safe to use in attribute values.
242         * Meta-characters (&lt;, &gt;, &amp; &quot; ') are rewritten
243         * to entity codes so that when shown on screen it looks like
244         * the raw HTML text when displayed in-line in HTML,
245         * and so as to avoid problems when embedded in XML and HTML.
246         * <p>
247         * If the input text does not contain these meta-characters
248         * then it is returned unchanged.
249         * <p>
250         * Additionally, all characters &lt; 32 (ASCII control characters)
251         * are converted to spaces.
252         * <p>
253         * If null is passed in, then this returns null,
254         * to simplify its use in some cases where a null might be present.
255         *
256         * @param in  String possibly containing HTML/XML meta-characters
257         */
258        public static String escapeHTMLMetaChars(final String in)
259            {
260            if(in == null) { return(null); }
261    
262            // Check for "meta" characters in input.
263            // If none found then we can return the input String as-is.
264            boolean foundMetaChar = false;
265            for(int i = in.length(); --i >= 0; )
266                {
267                final char c = in.charAt(i);
268                if((c < 32) || ("<>\"'&".indexOf(c) != -1))
269                    {
270                    foundMetaChar = true;
271                    break;
272                    }
273                }
274            if(!foundMetaChar) { return(in); }
275    
276            // Take a working writable copy of the input.
277            final StringBuilder work = new StringBuilder(in);
278    
279            // Work backwards through the string
280            // (for efficiency and simplicity)
281            // carefully escaping the meta-characters.
282            for(int i = work.length(); --i >= 0; )
283                {
284                final char c = work.charAt(i);
285                String replacement;
286                switch(c)
287                    {
288                    case '<': { replacement =  "&lt;"; break; }
289                    case '>': { replacement =  "&gt;"; break; }
290                    case '&': { replacement =  "&amp;"; break; }
291                    case '"': { replacement =  "&quot;"; break; }
292                    case '\'': { replacement =  "&#039;"; break; }
293                    default:
294                        {
295                        // Replace ASCII control codes...
296                        if(c < 32) { replacement = " "; break; }
297    
298                        // No replacement needed.
299                        continue;
300                        }
301                    }
302    
303                // Delete the old character.
304                work.deleteCharAt(i);
305                // Insert the new string
306                // (we could merge these two ops for efficiency).
307                work.insert(i, replacement);
308                }
309    
310            return(work.toString());
311            }
312    
313        /**Private to encode8To6()/decode8To6(); automagically created on first access. */
314        private static final class Base64Cache
315            {
316            /**Prevent construction of any instance. */
317            private Base64Cache() {}
318    
319            /**Private to encode8To6()/decode8To6(). */
320            private static final char[] alphabet = {
321                'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',  //  0 to  7
322                'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',  //  8 to 15
323                'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',  // 16 to 23
324                'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',  // 24 to 31
325                'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',  // 32 to 39
326                'o', 'p', 'q', 'r', 's', 't', 'u', 'v',  // 40 to 47
327                'w', 'x', 'y', 'z', '0', '1', '2', '3',  // 48 to 55
328                '4', '5', '6', '7', '8', '9', '+', '/'}; // 56 to 63
329    
330            /**Index/value of each legit 7-bit-ASCII character from the alphabet.
331             * All other (ie illegal) values map to -1.
332             * <p>
333             * This is effectively the reverse mapping for alphabet[].
334             */
335            private static final int valueOf[] = new int[128];
336    
337            /**Initialise valueOf[]. */
338            static
339                {
340                for(int i = valueOf.length; --i >= 0; ) { valueOf[i] = -1; }
341                for(int i = alphabet.length; --i >= 0; ) { valueOf[alphabet[i]] = i; }
342                }
343    
344            /**Gets valueOf given char; non-negative.
345             * @throws IllegalArgumentException for invalid char (not from allowed alphabet)
346             * @return value in range 0--63 inclusive
347             */
348            public static int getValueOf(final char c)
349                {
350                if(c > 128) { throw new IllegalArgumentException(); }
351                final int v = valueOf[c];
352                if(v == -1) { throw new IllegalArgumentException(); }
353                return(v);
354                }
355            }
356    
357        /**Decode to a byte array (8 bit) from ASCII Base-64 (6 bit); never null.
358         * The input data must always be a multiple of 4 characters,
359         * and all character must come from the standard Base64 set [A-Za-z0-9+/=].
360         *
361         * @param base64Text  data in base-46, eg as encoded by encode8To6(); never null
362         * @return binary data; never null
363         */
364        public static byte[] decode8To6(final String base64Text)
365            {
366            if(base64Text == null) { throw new IllegalArgumentException(); }
367            final int inputLen = base64Text.length();
368            // Handle empty input specially.
369            if(inputLen == 0) { return(new byte[0]); }
370            // Check input is legitimate length (eg appropriately padded with trailing '='s).
371            if((inputLen & 3) != 0)
372                { throw new IllegalArgumentException("input length must be multiple of 4: was "+base64Text.length()); }
373    
374            // Compute the maximum result length (absent trailing '='s).
375            final int maxResultSize = 3 * (inputLen / 4);
376            // Compute the actual result size.
377            final boolean oneTrailingEquals = (base64Text.charAt(inputLen-1) == '=');
378            final boolean twoTrailingEquals = oneTrailingEquals && (base64Text.charAt(inputLen-2) == '=');
379            final int resultSize = twoTrailingEquals ? (maxResultSize-2) :
380                                       (oneTrailingEquals ? (maxResultSize-1) :
381                                           maxResultSize);
382            // Make the array for the final result...
383            final byte result[] = new byte[resultSize];
384    
385            // Decoding TO block of 3 bytes at index i in the result
386            // FROM block of 4 characters at index j in the source.
387            for(int i = 0, j = 0; i < resultSize; i += 3, j += 4)
388                {
389                // Special treatment for the last block if it is partial.
390                final boolean partialLastBlock = ((i+3) > resultSize);
391    
392                // Always the first two chars of input block must be real encoded data.
393                final int v0 = Base64Cache.getValueOf(base64Text.charAt(j+0));
394                final int v1 = Base64Cache.getValueOf(base64Text.charAt(j+1));
395                // Always at least one output byte.
396                result[i+0] = (byte)((v0 << 2)         + (v1 >>> 4)); // b0 = v0v0v0v0 v0v0v1v1
397    
398                if(!partialLastBlock)
399                    {
400                    // Full block; two more bytes to decode from three more input chars.
401                    final int v2 = Base64Cache.getValueOf(base64Text.charAt(j+2));
402                    final int v3 = Base64Cache.getValueOf(base64Text.charAt(j+3));
403                    result[i+1] = (byte)(((v1 & 0xf) << 4) + (v2 >>> 2)); // b1 = v1v1v1v1 v2v2v2v2
404                    result[i+2] = (byte)(((v2 & 3) << 6)   +  v3       ); // b2 = v2v2v3v3 v3v3v3v3
405                    }
406                else if(!twoTrailingEquals)
407                    {
408                    // Partial last block with more than one byte to decode from one more input char.
409                    final int v2 = Base64Cache.getValueOf(base64Text.charAt(j+2));
410                    result[i+1] = (byte)(((v1 & 0xf) << 4) + (v2 >>> 2)); // b1 = v1v1v1v1 v2v2v2v2
411                    }
412                }
413    
414            return(result);
415            }
416    
417        /**Encode a byte array (8 bit) in ASCII Base-64 (6 bit); never null.
418         * With thanks to Apache.
419         * @param data8Bit  binary input data; never null
420         * @return ASCII base-64 representation as String; never null
421         */
422        public static String encode8To6(final byte[] data8Bit)
423            {
424            final StringBuilder sb = new StringBuilder(data8Bit.length * 2);
425    
426            int i = 0; // Index in data8Bit[].
427            while((i + 3) <= data8Bit.length)
428                {
429                int bits24 = (data8Bit[i++] & 0xFF) << 16;
430                bits24 |= (data8Bit[i++] & 0xFF) << 8;
431                bits24 |= (data8Bit[i++] & 0xFF);
432                sb.append(Base64Cache.alphabet[(bits24 & 0x00FC0000) >> 18]);
433                sb.append(Base64Cache.alphabet[(bits24 & 0x0003F000) >> 12]);
434                sb.append(Base64Cache.alphabet[(bits24 & 0x00000FC0) >> 6]);
435                sb.append(Base64Cache.alphabet[(bits24 & 0x0000003F)]);
436                }
437    
438            if(data8Bit.length - i == 2)
439                {
440                int bits24 = (data8Bit[i] & 0xFF) << 16;
441                bits24 |= (data8Bit[i + 1] & 0xFF) << 8;
442                sb.append(Base64Cache.alphabet[(bits24 & 0x00FC0000) >> 18]);
443                sb.append(Base64Cache.alphabet[(bits24 & 0x0003F000) >> 12]);
444                sb.append(Base64Cache.alphabet[(bits24 & 0x00000FC0) >> 6]);
445                sb.append('=');
446                }
447            else if(data8Bit.length - i == 1)
448                {
449                final int bits24 = (data8Bit[i] & 0xFF) << 16;
450                sb.append(Base64Cache.alphabet[(bits24 & 0x00FC0000) >> 18]);
451                sb.append(Base64Cache.alphabet[(bits24 & 0x0003F000) >> 12]);
452                sb.append('=');
453                sb.append('=');
454                }
455    
456            return(sb.toString());
457            }
458    
459        /**Sanitise text for XHTML/HTML/WML/XML use.
460         * The transformations are:
461         * <ul>
462         * <li>A null argument is converted to an empty string.
463         * <li>Whitespace is trimmed from the ends.
464         * <li>Converts any ``&lt;'' to ``{,'' ``&gt;'' to ``}''.
465         * <li>Unless in "allowEntities" modes, converts any ``&amp;'' to ``+'',
466         *     else leaves syntactically-correct entities intact and counts
467         *     as a single character.
468         * <li>Any non-ASCII characters are converted to a safe value;
469         *     if entity codes are permitted then these values are converted
470         *     to entity codes.
471         * <li>The result is limited to at most the specified number of characters.
472         * </ul>
473         * <p>
474         * Our simple definition of a syntactically-correct entity is that
475         * it may contain any selection of ASCII digits and letters and optionally
476         * a leading hash (`#'), but certainly no whitespace.  We may enforce a
477         * length limit.
478         * <p>
479         * If the input string is OK it is returned untouched.
480         * <p>
481         * TODO: Needs version with extra argument to allow entity sequences
482         *     (of the form '&amp;'xxx;) to be treated as single characters
483         *     and to delete any '&lt;'...'&gt;' sequences,
484         *     ie to do a more sophisticated job of sanitising XML/HTML
485         *     that has some simple markup (primarily entity codes needed for i18n).
486         * TODO: jUnit tests
487         *
488         * @param maxLen  if positive the output is limited to at most
489         *     this number of characters;
490         *     if &gt;= 3 truncated text is marked with a trailing ``...''
491         *     in the last three positions
492         * @param allowEntities  if true, treats HTML/XML entity sequences
493         *     as if single characters; the entities are vaguely tested for
494         *     correct syntax and may not be allowed if invalid
495         */
496        public static String sanitiseForXML(final String input,
497                                            final int maxLen,
498                                            final boolean allowEntities)
499            {
500            if(maxLen < 0) { throw new IllegalArgumentException(); }
501    
502            if(input == null) { return(""); }
503    
504            // First trim whitespace from either end.
505            String in = input.trim();
506    
507            // Always remove potentially-unsafe tag meta-characters.
508            in = in.replace('<', '{').replace('>', '}');
509    
510            // If non-null, the input string has had all entities replaced with
511            // just the leading '&' with the entity code
512            // (not including the trailing ';')
513            // at the matching index in this array,
514            // so all entities will appear as a single character.
515            String entities[] = null;
516    
517            // If not allowing entities zap all '&' chars...
518            if(!allowEntities)
519                { in = in.replace('&', '+'); }
520            // If we are allowing entities, and there are any,
521            // convert them all to placeholders.
522            else if(in.indexOf('&') >= 0)
523                {
524                // in may get truncated but not longer while this array is in use.
525                entities = new String[in.length()];
526    
527                // Break out the (valid) entities one at a time...
528                for(int nextAmp = -1; (nextAmp = in.indexOf('&', nextAmp + 1)) != -1; )
529                    {
530                    // Look for the next semicolon...
531                    final int nextSc = in.indexOf(';', nextAmp + 1);
532                    if(nextSc == -1)
533                        {
534                        // Whoops; if no more semicolons, then
535                        // all ampersands from current are bogus and we can stop.
536                        in = ((nextAmp >= 0) ? in.substring(0, nextAmp) : "") +
537                            (in.substring(nextAmp).replace('&', '+'));
538                        break;
539                        }
540    
541    
542                    // TODO: do more validity checking of a putative entity:
543                    // check that it letters and digits or a hash and all digits.
544    
545    
546                    // Now assume that the entity is valid,
547                    // so break it out into the entities array
548                    // leaving the '&' behind
549                    // and putting the body (not the trailing ';')
550                    // in the entities[] array at the '&' index.
551                    entities[nextAmp] = in.substring(nextAmp+1, nextSc);
552                    in = in.substring(0, nextAmp+1) +
553                         in.substring(nextSc+1);
554                    }
555                }
556    
557            // Truncate if necessary while
558            // any entities that we care about are "broken out"...
559            if((maxLen >= 0) && (in.length() > maxLen))
560                {
561                if(maxLen >= 3) { in = in.substring(0, maxLen-3) + "..."; }
562                else { in = in.substring(0, maxLen); }
563                }
564    
565            // If we broke out any entities,
566            // put any remaining ones back in-line.
567            if(entities != null)
568                {
569                final StringBuilder sb = new StringBuilder(in.length()); // At least this long.
570    
571                final int len = in.length();
572                for(int i = 0; i < len; ++i)
573                    {
574                    final char c = in.charAt(i);
575                    sb.append(c);
576                    if(c != '&')
577                        { continue; } // Usual case; no entity involved.
578    
579                    // Expand an entity.
580                    assert(entities[i] != null);
581                    sb.append(entities[i]).append(';');
582                    }
583    
584                // Convert back to expanded string.
585                in = sb.toString();
586    
587    //            entities = null; // Allow GC.
588                }
589    
590            // Look for any char outwith range 32--126.
591            // If one is found then replace every occurrence with ' ' and retrim at end,
592            // unless allowing entities in which case replace with an entity code.
593            for(int i = in.length(); --i >= 0; )
594                {
595                final char c = in.charAt(i);
596                if((c < 32) || (c > 126))
597                    {
598                    // Whole string needs fixing...
599                    final StringBuilder sb = new StringBuilder(2 * in.length());
600                    for(int j = 0; j < in.length(); ++j)
601                        {
602                        final char d = in.charAt(j);
603                        if((d < 32) || (d > 126))
604                            {
605                            // If allowing entities,
606                            // convert non-ASCII chars to entities.
607                            if(!allowEntities) { sb.append(' '); }
608                            else { sb.append("&#").append(0xffff & d).append(';'); }
609                            }
610                        else { sb.append(d); }
611                        }
612                    in = sb.toString();
613                    if(!allowEntities) { in = in.trim(); }
614                    break;
615                    }
616                }
617    
618            if(input.equals(in)) { return(input); } // Avoid making copies!
619            return(in);
620            }
621    
622    
623        /**Extension for a CharSequence that holds only 8-bit character values.
624         * The implementation of this may often hold the underlying data as a byte array.
625         */
626        public static interface CharSequence8Bit extends CharSequence
627            {
628            /**Return private copy (8-bit) text with each byte corresponding to one char; never null.
629             * May be far more efficient than converting via a String
630             * if the aim is to write directly to a file for example.
631             */
632            public byte[] toByteArray();
633    
634            /**Returns byte value at given index.
635             * Like charAt(), but avoids explicit conversion to char.
636             */
637            public byte byteAt(final int i);
638            }
639    
640        /**Extension (marker interface) for a CharSequence that holds only 7-bit character values.
641         * The implementation of this may often hold the underlying data as a byte array.
642         */
643        public static interface CharSequence7Bit extends CharSequence8Bit { }
644    
645        /**Orders CharSequence objects as if by String.compareTo(); not null.
646         * If both arguments to compare() are the same reference (even null) this returns 0 immediately.
647         */
648        public static final Comparator<CharSequence> CASE_SENSITIVE_ORDER = new Comparator<CharSequence>(){
649            public int compare(final CharSequence cs1, final CharSequence cs2)
650                {
651                if(cs1 == cs2) { return(0); } // Optimisation...
652                final int l1 = cs1.length();
653                final int l2 = cs2.length();
654                final int l = Math.min(l1, l2);
655                for(int i = 0; i < l; ++i)
656                    {
657                    final char c1 = cs1.charAt(i);
658                    final char c2 = cs2.charAt(i);
659                    if(c1 != c2)
660                        { return(c1 - c2); }
661                    }
662                return(l1 - l2);
663                }
664            };
665    
666        /**Orders CharSequence objects as if by String.compareToIgnoreCase(); not null.
667         * If both arguments to compare() are the same reference (even null) this returns 0 immediately.
668         */
669        public static final Comparator<CharSequence> CASE_INSENSITIVE_ORDER = new Comparator<CharSequence>(){
670            public int compare(final CharSequence cs1, final CharSequence cs2)
671                {
672                if(cs1 == cs2) { return(0); } // Optimisation...
673                final int l1 = cs1.length();
674                final int l2 = cs2.length();
675                final int l = Math.min(l1, l2);
676                for(int i = 0; i < l; ++i)
677                    {
678                    char c1 = cs1.charAt(i);
679                    char c2 = cs2.charAt(i);
680                    if(c1 != c2)
681                        {
682                        c1 = Character.toUpperCase(c1);
683                        c2 = Character.toUpperCase(c2);
684                        if(c1 != c2)
685                            {
686                            c1 = Character.toLowerCase(c1);
687                            c2 = Character.toLowerCase(c2);
688                            if (c1 != c2)
689                                { return(c1 - c2); }
690                            }
691                        }
692                    }
693                return(l1 - l2);
694                }
695            };
696    
697        /**First index of specified character in given (non-null) CharSequence as for String.
698         * @return index of first occurrence of c, else -1 if no occurrence of c
699         */
700        public static final int indexOf(final CharSequence cs, final char c)
701            { return(TextUtils.indexOf(cs, c, 0)); }
702    
703        /**First index of specified character in given (non-null) CharSequence from/after specified index as for String.
704         * @return index of first occurrence of c, else -1 if no occurrence of c
705         */
706        public static final int indexOf(final CharSequence cs, final char c, final int fromIndex)
707            {
708            if(cs == null) { throw new IllegalArgumentException(); }
709            final int len = cs.length();
710            for(int i = Math.max(0, fromIndex); i < len; ++i)
711                { if(c == cs.charAt(i)) { return(i); } }
712            return(-1);
713            }
714    
715        /**Last index of specified character in given (non-null) CharSequence as for String.
716         * @return index of last occurrence of c, else -1 if no occurrence of c
717         */
718        public static final int lastIndexOf(final CharSequence cs, final char c)
719            { return(TextUtils.lastIndexOf(cs, c, Integer.MAX_VALUE)); }
720    
721        /**Last index of specified character in given (non-null) CharSequence from/before specified start index as for String.
722         * @return index of last occurrence of c, else -1 if no occurrence of c
723         */
724        public static final int lastIndexOf(final CharSequence cs, final char c, final int fromIndex)
725            {
726            if(cs == null) { throw new IllegalArgumentException(); }
727            final int len = cs.length();
728            for(int i = ((fromIndex >= len) ? len - 1 : fromIndex); --i >= 0; )
729                { if(c == cs.charAt(i)) { return(i); } }
730            return(-1);
731            }
732    
733        /**Checks that the specified region of two (non-null) CharSequences matches exactly (as for String).
734         */
735        public static boolean regionMatches(final CharSequence cs1, final int start1,
736                                            final CharSequence cs2, final int start2,
737                                            final int len)
738            {
739            if((null == cs1) || (null == cs2)) { throw new IllegalArgumentException(); }
740            if((start1 < 0) || (start2 < 0)) { throw new IllegalArgumentException(); }
741            if((start1 + len > cs1.length()) || (start2 + len > cs2.length())) { throw new IllegalArgumentException(); }
742            for(int i = len; --i >= 0; )
743                { if(cs1.charAt(start1 + i) != cs2.charAt(start2 + i)) { return(false); } }
744            return(true);
745            }
746    
747        /**Checks that two CharSequences are both null or represent the same (non-null) sequence of chars. */
748        public static boolean contentEqualsOrBothNull(final CharSequence cs1, final CharSequence cs2)
749            {
750            if(cs1 == cs2) { return(true); } // Optimisation; both null or same non-null item.
751            if((null == cs1) != (null == cs2)) { return(false); } // Only one is null.
752            final int len = cs1.length();
753            if(len != cs2.length()) { return(false); }
754            for(int i = len; --i >= 0; )
755                { if(cs1.charAt(i) != cs2.charAt(i)) { return(false); } }
756            return(true);
757            }
758    
759        /**Checks that two (non-null) 8-bit CharSequences represent the same sequence of chars/bytes. */
760        public static boolean contentEquals(final CharSequence8Bit cs1, final CharSequence8Bit cs2)
761            {
762            if((null == cs1) || (null == cs2)) { throw new IllegalArgumentException(); }
763            if(cs1 == cs2) { return(true); } // Optimisation.
764            final int len = cs1.length();
765            if(len != cs2.length()) { return(false); }
766            for(int i = len; --i >= 0; )
767                { if(cs1.byteAt(i) != cs2.byteAt(i)) { return(false); } }
768            return(true);
769            }
770    
771        /**Checks that two (non-null) CharSequences represent the same sequence of chars. */
772        public static boolean contentEquals(final CharSequence cs1, final CharSequence cs2)
773            {
774            if((null == cs1) || (null == cs2)) { throw new IllegalArgumentException(); }
775            if(cs1 == cs2) { return(true); } // Optimisation.
776            final int len = cs1.length();
777            if(len != cs2.length()) { return(false); }
778            for(int i = len; --i >= 0; )
779                { if(cs1.charAt(i) != cs2.charAt(i)) { return(false); } }
780            return(true);
781            }
782    
783        /**Checks that two (non-null) CharSequences represent the same sequence of chars, ignoring case. */
784        public static boolean contentEqualsIgnoreCase(final CharSequence cs1, final CharSequence cs2)
785            {
786            if((null == cs1) || (null == cs2)) { throw new IllegalArgumentException(); }
787            if(cs1 == cs2) { return(true); } // Optimisation.
788            final int len = cs1.length();
789            if(len != cs2.length()) { return(false); }
790            // Use our case-insensitive comparator for consistency.
791            return(0 == CASE_INSENSITIVE_ORDER.compare(cs1, cs2));
792            }
793    
794        /**Compares two (non-null) CharSequences for lexical order. */
795        public static int compare(final CharSequence cs1, final CharSequence cs2)
796            {
797            // Arg check for non-null omitted as performance-critical as of 2012/02/26, and not all JITs smart enough to elide.
798            //if((null == cs1) || (null == cs2)) { throw new IllegalArgumentException(); }
799            final int l1 = cs1.length();
800            final int l2 = cs2.length();
801            final int l = Math.min(l1, l2);
802            for(int i = 0; i < l; ++i)
803                {
804                final char c1 = cs1.charAt(i);
805                final char c2 = cs2.charAt(i);
806                if(c1 != c2) { return(c1 - c2); }
807                }
808            return(l1 - l2);
809            }
810    
811        /**Returns true if the first sequence starts with the second (neither null), else false. */
812        public static boolean startsWith(final CharSequence mainText,
813                                         final CharSequence putativePrefix)
814            {
815            final int ppl = putativePrefix.length();
816            return((mainText.length() >= ppl) &&
817                regionMatches(mainText, 0, putativePrefix, 0, ppl));
818            }
819    
820        /**Returns true if the first sequence ends with the second (neither null), else false. */
821        public static boolean endsWith(final CharSequence mainText,
822                                       final CharSequence putativeSuffix)
823            {
824            final int psl = putativeSuffix.length();
825            final int ml = mainText.length();
826            return((ml >= psl) &&
827                regionMatches(mainText, ml-psl, putativeSuffix, 0, psl));
828            }
829    
830        /**Create and populate a SortedSet of CharSequence in natural/total case-sensitive order that will work with any mix of immutable CharSequence keys; never null.
831         * This may not be consistent with equals() if keys are of mixed types.
832         * <p>
833         * The result is not thread-safe and is mutable.
834         *
835         * @param csc  initial collection to populate from; can be null to avoid initial population
836         */
837        public static <T extends CharSequence> SortedSet<T> createCharSequenceSortedSet(final Collection<T> csc)
838            { return(createCharSequenceSortedSet(csc, TextUtils.CASE_SENSITIVE_ORDER)); }
839    
840        /**Create and populate a SortedSet of CharSequence in specified order that will work with any mix of immutable CharSequence keys; never null.
841         * This may not be consistent with equals() if keys are of mixed types.
842         * <p>
843         * The result is not thread-safe and is mutable.
844         *
845         * @param csc  initial collection to populate from; can be null to avoid initial population
846         * @param comparator   used to order the items; never null
847         */
848        public static <T extends CharSequence> SortedSet<T> createCharSequenceSortedSet(final Collection<T> csc, final Comparator<CharSequence> comparator)
849            {
850            if(null == comparator) { throw new IllegalArgumentException(); }
851            final SortedSet<T> result = new TreeSet<T>(comparator);
852            if(null != csc) { result.addAll(csc); }
853            return(result);
854            }
855    
856        /**Minimum-character entity to hold as Name with its high overheads; non-negative.
857         * If smaller than this then CS8Bit may be used instead,
858         * with internal de-duping of values and keys
859         * rather than the global intern()-driven system used by Name.
860         * <p>
861         * Empirically determined by distribution of sizes of keys and values!
862         */
863        private static final int MIN_NAME_CHARS_FOR_EFFICIENT_TEXT_REPRESENTATION = 16;
864    
865        /**Make an efficient representation for possibly non-unique text to be held in memory long-term.
866         * Does not make reference to other existing values that may be held together.
867         * <p>
868         * A null input results in a null.
869         * <p>
870         * A non-8-bit (or empty) text results in a String result.
871         * <p>
872         * A short (8-bit) text will result in an non-intern()ed CS8Bit result.
873         * <p>
874         * Else an (implicitly intern()ed) Name is returned.
875         * <p>
876         * This aims to avoid intern() overhead for small values.
877         *
878         * @param value  the value to optimise
879         * @param prevRef  if non-null, used to convey Name values from one call to the next
880         *     to help provide a more-compact representation by sharing prefixes and suffixes
881         */
882        public static CharSequence makeEfficientTextRepresentation(final CharSequence value,
883                                                                   final AtomicReference<Name> prevRef)
884            {
885            if(null == value) { return(null); }
886            final int length = value.length();
887            if(0 == length) { return(""); }
888    
889            CharSequence vcs;
890            try
891                {
892                // If the value is short then use CS8Bit and don't intern() it because of overhead.
893                // If the supplied value has the correct type then use as-is,
894                // else try to convert it to the target type,
895                // falling back to String if not possible (eg non-8-bit characters present).
896                if(length < MIN_NAME_CHARS_FOR_EFFICIENT_TEXT_REPRESENTATION)
897                    {
898                    final CS8Bit c = (value.getClass() == CS8Bit.class) ? (CS8Bit) value : new CS8Bit(value);
899                    vcs = c;
900                    }
901                else
902                    {
903                    final Name n = (value.getClass() == Name.class) ? (Name) value :
904                        Name.create(value, (null != prevRef) ? prevRef.get() : null);
905                    vcs = n;
906                    prevRef.set(n);
907                    }
908                }
909            // If conversion of value to Name fails then convert to String and intern().
910            // We should only need a String form for non-8-bit values.
911            catch(final IllegalArgumentException e)
912                { vcs = MemoryTools.intern(value.toString()); }
913            return vcs;
914            }
915    
916        /**Efficient conversion from 7-bit or 8-bit text (once char per byte) to String; never null.
917         * The array passed is not altered in any way.
918         * @param ascii  text; never null but may be empty
919         * @return  new String instance; never null
920         */
921        public static String toString(final byte[] ascii)
922            { return(new String(ascii, 0)); } // The deprecated method is by far the most straightforward.
923    
924        /**Return a hash code the same as or similar to that of a String containing the same characters. */
925        public static int hashCode(final CharSequence cs)
926            {
927            final int len = cs.length();
928            int h = 0;
929            for(int i = 0; i < len; i++)
930                { h = 31 * h + cs.charAt(i); }
931            return(h);
932            }
933    
934        /**Return an ASCII printable hex hash code; never null nor empty. */
935        public static String hashCodeHexString(final CharSequence cs)
936            { return(Integer.toHexString(hashCode(cs)));}
937    
938        /**Compiled regex to match inter-word gaps in plain (mainly English) text; non-null. */
939        private static final Pattern wordBoundaryRegex = Pattern.compile("\\s+");
940    
941        /**Quickly attempt to count the words in plain text; non-negative. */
942        public static int quickWordCount(final CharSequence text)
943            {
944            int matchCount = 0;
945    
946            // Count segments before each match.
947            final Matcher m = wordBoundaryRegex.matcher(text);
948            int index = 0;
949            while(m.find())
950                {
951                final String match = text.subSequence(index, m.start()).toString();
952                if(0 != match.length()) { ++matchCount; }
953                index = m.end();
954                }
955    
956            final int length = text.length();
957    
958            // If no match was found, return 0 if string empty else 1 (assume text contains one word).
959            if (index == 0) { return((length == 0) ? 0 : 1); }
960    
961            // Add any non-empty trailing segment.
962            if(index < length) { ++matchCount; }
963    
964            return(matchCount);
965            }
966        }