001 /*
002 Copyright (c) 1996-2012, Damon Hart-Davis
003 All rights reserved.
004
005 Redistribution and use in source and binary forms, with or without
006 modification, are permitted provided that the following conditions are
007 met:
008
009 * Redistributions of source code must retain the above copyright
010 notice, this list of conditions and the following disclaimer.
011
012 * Redistributions in binary form must reproduce the above copyright
013 notice, this list of conditions and the following disclaimer in the
014 documentation and/or other materials provided with the
015 distribution.
016
017 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
018 IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
019 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
020 PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
021 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
022 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
023 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
024 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
025 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
026 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
027 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
028 */
029 package org.hd.d.pg2k.svrCore;
030
031 import java.util.Collection;
032 import java.util.Comparator;
033 import java.util.SortedSet;
034 import java.util.TreeSet;
035 import java.util.concurrent.atomic.AtomicReference;
036 import java.util.regex.Matcher;
037 import java.util.regex.Pattern;
038
039 import org.w3c.dom.Element;
040 import org.w3c.dom.NamedNodeMap;
041 import org.w3c.dom.Node;
042
043 /**Some simple common text utilities of scope throughout the application.
044 */
045 public final class TextUtils
046 {
047 /**Prevent construction of an instance. */
048 private TextUtils() { }
049
050 /**Suffixes used by sizeAsText. */
051 private static final String sizeSuffixes[] = { "B", "kB", "MB", "GB", "TB", "EB" };
052 /**Renders size in bytes as text, abbreviated if requested.
053 * Can produce normal or abbreviated output.
054 */
055 public static final String sizeAsText(final long size, final boolean abbrev)
056 {
057 final StringBuilder sb = new StringBuilder(abbrev ? 5 : 16);
058 if(abbrev)
059 {
060 long divisor = 1;
061 for(int i = 0; i < sizeSuffixes.length; ++i, divisor *= 1024)
062 {
063 if(size < divisor * 1000)
064 {
065 // Takes 3 or less digits with this suffix, so show it!
066 sb.append((size + divisor/2) / divisor);
067 sb.append(sizeSuffixes[i]);
068 return(sb.toString()); // Done.
069 }
070 }
071 // We may fall through to the default case...
072 }
073 sb.append(size);
074 sb.append("Byte");
075 return(sb.toString());
076 }
077
078 /**Returns true if given String is 7-bit clean, is is pure ASCII, or is null.
079 */
080 public static boolean isASCII7(final String s)
081 {
082 if(s == null) { return(true); }
083 for(int i = s.length(); --i >= 0; )
084 { if((s.charAt(i) & ~0x7f) != 0) { return(false); } }
085 return(true); // Seems OK.
086 }
087
088 /**Recursively copy second Node and contents into first node.
089 * This works even when the second Node is from a different Document to the first
090 * and importNode() does not work (eg due to JDK1.5 impl bug).
091 */
092 public static final void importCopy(final Node dest, final Node src)
093 {
094 if((dest == null) || (src == null))
095 { throw new IllegalArgumentException(); }
096
097 final Element cp = dest.getOwnerDocument().createElement(src.getNodeName());
098
099 // Deal with attrs, etc.
100 final boolean hasAttributes = (src.getNodeType() == Node.ELEMENT_NODE) &&
101 (src.getAttributes().getLength() > 0);
102 if(hasAttributes)
103 {
104 final NamedNodeMap attribs = src.getAttributes();
105 for(int i = 0; i < attribs.getLength(); i++)
106 { cp.setAttribute(attribs.item(i).getNodeName(), attribs.item(i).getNodeValue()); }
107 }
108
109 // Copy in any node value.
110 cp.setNodeValue(src.getNodeValue());
111
112 // Now add the new node to the destination tree.
113 dest.appendChild(cp);
114
115 // Recursively copy children.
116 for(Node child = src.getFirstChild(); child != null; child = child.getNextSibling())
117 { importCopy(cp, child); }
118 }
119
120 /**String to write as line terminator when converting to XML.
121 * A simple LF would suit UNIX-like systems
122 * though would probably be accepted by most.
123 * A bulkier CRLF is more like the Internet "standard".
124 */
125 private static final String XML_LINE_TERM = "\r\n";
126
127 /**Write DOM tree as XML/XHTML String; never "" nor null.
128 * Can write as terse/efficient single-line form or longer more human-friendly format.
129 * <p>
130 * When writing XHTML we make text nodes XML/HTML-safe, though allow embedded entities.
131 *
132 * @param node DOM tree root; never null
133 * @param toXHTML if true write with formatting suitable to include directly in HTML/XHTML for human consumption
134 * (eg using dl/ul/ol nested lists to produce formatted text representing the structure)
135 * rather than a pure XML representation of the Node tree itself
136 * @param terse if true write as compactly as possible, else make more human-readable if possible
137 */
138 public static final String toXML(final Node node, final boolean toXHTML, final boolean terse)
139 {
140 final StringBuilder result = new StringBuilder(64);
141 toXML(result, node, toXHTML, terse);
142 return(result.toString());
143 }
144
145 /**Write DOM tree as XML/XHTML String; appends to supplied StringBuilder.
146 * Can write as terse/efficient single-line form or longer more human-friendly format.
147 * <p>
148 * When writing XHTML we make text nodes XML/HTML-safe, though allow embedded entities.
149 *
150 * @param node DOM tree root; never null
151 * @param toXHTML if true write with formatting suitable to include directly in HTML/XHTML for human consumption
152 * (eg using dl/ul/ol nested lists to produce formatted text representing the structure)
153 * rather than a pure XML representation of the Node tree itself
154 * @param terse if true write as compactly as possible, else make more human-readable if possible
155 */
156 public static final void toXML(final StringBuilder result,
157 final Node node, final boolean toXHTML, final boolean terse)
158 {
159 if((node == null) || (result == null))
160 { throw new IllegalArgumentException(); }
161
162 final String nodeValue = node.getNodeValue();
163
164 final boolean hasValue = (node.getNodeValue() != null);
165
166 final boolean hasChildren = (node.getFirstChild() != null);
167
168 final boolean hasAttributes = (node.getNodeType() == Node.ELEMENT_NODE) &&
169 (node.getAttributes().getLength() > 0);
170
171 if(toXHTML) // XHTML/HTML formatted text.
172 {
173 result.append("<dl compact><dt>");
174 result.append("<b>").append(node.getNodeName()).append("</b>");
175
176 if(hasAttributes)
177 {
178 final NamedNodeMap attribs = node.getAttributes();
179 for(int i = 0; i < attribs.getLength(); i++)
180 {
181 result.append(' ');
182 result.append(attribs.item(i).getNodeName());
183 result.append("=\"");
184 result.append(escapeHTMLMetaChars(attribs.item(i).getNodeValue()));
185 result.append('"');
186 }
187 }
188
189 // Write any value with < > & " ' escaped...
190 if(hasValue) { result.append(" [").append(escapeHTMLMetaChars(nodeValue.trim())).append("] "); }
191
192 // Recursively deal with any child nodes...
193 if(hasChildren)
194 {
195 result.append("<dd>");
196 for(Node child = node.getFirstChild(); child != null; child = child.getNextSibling())
197 { toXML(result, child, toXHTML, terse); }
198 }
199
200 result.append("</dl>");
201 }
202 else // Pure XML.
203 {
204 result.append('<').append(node.getNodeName());
205
206 if(hasAttributes)
207 {
208 final NamedNodeMap attribs = node.getAttributes();
209 for(int i = 0; i < attribs.getLength(); i++)
210 {
211 result.append(' ');
212 result.append(attribs.item(i).getNodeName());
213 result.append("=\"");
214 result.append(escapeHTMLMetaChars(attribs.item(i).getNodeValue()));
215 result.append('"');
216 }
217 }
218
219 // If we have no value or children
220 // then we can use the <.../> format with no separate closing tag.
221 final boolean unitag = (!hasValue && !hasChildren);
222
223 if(unitag) { result.append('/'); }
224
225 result.append('>');
226
227 if(!unitag)
228 {
229 if(hasValue) { result.append(escapeHTMLMetaChars(nodeValue.trim())); }
230
231 for(Node child = node.getFirstChild(); child != null; child = child.getNextSibling())
232 { result.append(toXML(child, toXHTML, terse)); }
233
234 result.append("</").append(node.getNodeName()).append('>');
235 }
236 }
237
238 if(!terse) { result.append(XML_LINE_TERM); } // End the line for readability...
239 }
240
241 /**Rewrite HTML so that it displays as "raw" text and is safe to use in attribute values.
242 * Meta-characters (<, >, & " ') are rewritten
243 * to entity codes so that when shown on screen it looks like
244 * the raw HTML text when displayed in-line in HTML,
245 * and so as to avoid problems when embedded in XML and HTML.
246 * <p>
247 * If the input text does not contain these meta-characters
248 * then it is returned unchanged.
249 * <p>
250 * Additionally, all characters < 32 (ASCII control characters)
251 * are converted to spaces.
252 * <p>
253 * If null is passed in, then this returns null,
254 * to simplify its use in some cases where a null might be present.
255 *
256 * @param in String possibly containing HTML/XML meta-characters
257 */
258 public static String escapeHTMLMetaChars(final String in)
259 {
260 if(in == null) { return(null); }
261
262 // Check for "meta" characters in input.
263 // If none found then we can return the input String as-is.
264 boolean foundMetaChar = false;
265 for(int i = in.length(); --i >= 0; )
266 {
267 final char c = in.charAt(i);
268 if((c < 32) || ("<>\"'&".indexOf(c) != -1))
269 {
270 foundMetaChar = true;
271 break;
272 }
273 }
274 if(!foundMetaChar) { return(in); }
275
276 // Take a working writable copy of the input.
277 final StringBuilder work = new StringBuilder(in);
278
279 // Work backwards through the string
280 // (for efficiency and simplicity)
281 // carefully escaping the meta-characters.
282 for(int i = work.length(); --i >= 0; )
283 {
284 final char c = work.charAt(i);
285 String replacement;
286 switch(c)
287 {
288 case '<': { replacement = "<"; break; }
289 case '>': { replacement = ">"; break; }
290 case '&': { replacement = "&"; break; }
291 case '"': { replacement = """; break; }
292 case '\'': { replacement = "'"; break; }
293 default:
294 {
295 // Replace ASCII control codes...
296 if(c < 32) { replacement = " "; break; }
297
298 // No replacement needed.
299 continue;
300 }
301 }
302
303 // Delete the old character.
304 work.deleteCharAt(i);
305 // Insert the new string
306 // (we could merge these two ops for efficiency).
307 work.insert(i, replacement);
308 }
309
310 return(work.toString());
311 }
312
313 /**Private to encode8To6()/decode8To6(); automagically created on first access. */
314 private static final class Base64Cache
315 {
316 /**Prevent construction of any instance. */
317 private Base64Cache() {}
318
319 /**Private to encode8To6()/decode8To6(). */
320 private static final char[] alphabet = {
321 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', // 0 to 7
322 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', // 8 to 15
323 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', // 16 to 23
324 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', // 24 to 31
325 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', // 32 to 39
326 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', // 40 to 47
327 'w', 'x', 'y', 'z', '0', '1', '2', '3', // 48 to 55
328 '4', '5', '6', '7', '8', '9', '+', '/'}; // 56 to 63
329
330 /**Index/value of each legit 7-bit-ASCII character from the alphabet.
331 * All other (ie illegal) values map to -1.
332 * <p>
333 * This is effectively the reverse mapping for alphabet[].
334 */
335 private static final int valueOf[] = new int[128];
336
337 /**Initialise valueOf[]. */
338 static
339 {
340 for(int i = valueOf.length; --i >= 0; ) { valueOf[i] = -1; }
341 for(int i = alphabet.length; --i >= 0; ) { valueOf[alphabet[i]] = i; }
342 }
343
344 /**Gets valueOf given char; non-negative.
345 * @throws IllegalArgumentException for invalid char (not from allowed alphabet)
346 * @return value in range 0--63 inclusive
347 */
348 public static int getValueOf(final char c)
349 {
350 if(c > 128) { throw new IllegalArgumentException(); }
351 final int v = valueOf[c];
352 if(v == -1) { throw new IllegalArgumentException(); }
353 return(v);
354 }
355 }
356
357 /**Decode to a byte array (8 bit) from ASCII Base-64 (6 bit); never null.
358 * The input data must always be a multiple of 4 characters,
359 * and all character must come from the standard Base64 set [A-Za-z0-9+/=].
360 *
361 * @param base64Text data in base-46, eg as encoded by encode8To6(); never null
362 * @return binary data; never null
363 */
364 public static byte[] decode8To6(final String base64Text)
365 {
366 if(base64Text == null) { throw new IllegalArgumentException(); }
367 final int inputLen = base64Text.length();
368 // Handle empty input specially.
369 if(inputLen == 0) { return(new byte[0]); }
370 // Check input is legitimate length (eg appropriately padded with trailing '='s).
371 if((inputLen & 3) != 0)
372 { throw new IllegalArgumentException("input length must be multiple of 4: was "+base64Text.length()); }
373
374 // Compute the maximum result length (absent trailing '='s).
375 final int maxResultSize = 3 * (inputLen / 4);
376 // Compute the actual result size.
377 final boolean oneTrailingEquals = (base64Text.charAt(inputLen-1) == '=');
378 final boolean twoTrailingEquals = oneTrailingEquals && (base64Text.charAt(inputLen-2) == '=');
379 final int resultSize = twoTrailingEquals ? (maxResultSize-2) :
380 (oneTrailingEquals ? (maxResultSize-1) :
381 maxResultSize);
382 // Make the array for the final result...
383 final byte result[] = new byte[resultSize];
384
385 // Decoding TO block of 3 bytes at index i in the result
386 // FROM block of 4 characters at index j in the source.
387 for(int i = 0, j = 0; i < resultSize; i += 3, j += 4)
388 {
389 // Special treatment for the last block if it is partial.
390 final boolean partialLastBlock = ((i+3) > resultSize);
391
392 // Always the first two chars of input block must be real encoded data.
393 final int v0 = Base64Cache.getValueOf(base64Text.charAt(j+0));
394 final int v1 = Base64Cache.getValueOf(base64Text.charAt(j+1));
395 // Always at least one output byte.
396 result[i+0] = (byte)((v0 << 2) + (v1 >>> 4)); // b0 = v0v0v0v0 v0v0v1v1
397
398 if(!partialLastBlock)
399 {
400 // Full block; two more bytes to decode from three more input chars.
401 final int v2 = Base64Cache.getValueOf(base64Text.charAt(j+2));
402 final int v3 = Base64Cache.getValueOf(base64Text.charAt(j+3));
403 result[i+1] = (byte)(((v1 & 0xf) << 4) + (v2 >>> 2)); // b1 = v1v1v1v1 v2v2v2v2
404 result[i+2] = (byte)(((v2 & 3) << 6) + v3 ); // b2 = v2v2v3v3 v3v3v3v3
405 }
406 else if(!twoTrailingEquals)
407 {
408 // Partial last block with more than one byte to decode from one more input char.
409 final int v2 = Base64Cache.getValueOf(base64Text.charAt(j+2));
410 result[i+1] = (byte)(((v1 & 0xf) << 4) + (v2 >>> 2)); // b1 = v1v1v1v1 v2v2v2v2
411 }
412 }
413
414 return(result);
415 }
416
417 /**Encode a byte array (8 bit) in ASCII Base-64 (6 bit); never null.
418 * With thanks to Apache.
419 * @param data8Bit binary input data; never null
420 * @return ASCII base-64 representation as String; never null
421 */
422 public static String encode8To6(final byte[] data8Bit)
423 {
424 final StringBuilder sb = new StringBuilder(data8Bit.length * 2);
425
426 int i = 0; // Index in data8Bit[].
427 while((i + 3) <= data8Bit.length)
428 {
429 int bits24 = (data8Bit[i++] & 0xFF) << 16;
430 bits24 |= (data8Bit[i++] & 0xFF) << 8;
431 bits24 |= (data8Bit[i++] & 0xFF);
432 sb.append(Base64Cache.alphabet[(bits24 & 0x00FC0000) >> 18]);
433 sb.append(Base64Cache.alphabet[(bits24 & 0x0003F000) >> 12]);
434 sb.append(Base64Cache.alphabet[(bits24 & 0x00000FC0) >> 6]);
435 sb.append(Base64Cache.alphabet[(bits24 & 0x0000003F)]);
436 }
437
438 if(data8Bit.length - i == 2)
439 {
440 int bits24 = (data8Bit[i] & 0xFF) << 16;
441 bits24 |= (data8Bit[i + 1] & 0xFF) << 8;
442 sb.append(Base64Cache.alphabet[(bits24 & 0x00FC0000) >> 18]);
443 sb.append(Base64Cache.alphabet[(bits24 & 0x0003F000) >> 12]);
444 sb.append(Base64Cache.alphabet[(bits24 & 0x00000FC0) >> 6]);
445 sb.append('=');
446 }
447 else if(data8Bit.length - i == 1)
448 {
449 final int bits24 = (data8Bit[i] & 0xFF) << 16;
450 sb.append(Base64Cache.alphabet[(bits24 & 0x00FC0000) >> 18]);
451 sb.append(Base64Cache.alphabet[(bits24 & 0x0003F000) >> 12]);
452 sb.append('=');
453 sb.append('=');
454 }
455
456 return(sb.toString());
457 }
458
459 /**Sanitise text for XHTML/HTML/WML/XML use.
460 * The transformations are:
461 * <ul>
462 * <li>A null argument is converted to an empty string.
463 * <li>Whitespace is trimmed from the ends.
464 * <li>Converts any ``<'' to ``{,'' ``>'' to ``}''.
465 * <li>Unless in "allowEntities" modes, converts any ``&'' to ``+'',
466 * else leaves syntactically-correct entities intact and counts
467 * as a single character.
468 * <li>Any non-ASCII characters are converted to a safe value;
469 * if entity codes are permitted then these values are converted
470 * to entity codes.
471 * <li>The result is limited to at most the specified number of characters.
472 * </ul>
473 * <p>
474 * Our simple definition of a syntactically-correct entity is that
475 * it may contain any selection of ASCII digits and letters and optionally
476 * a leading hash (`#'), but certainly no whitespace. We may enforce a
477 * length limit.
478 * <p>
479 * If the input string is OK it is returned untouched.
480 * <p>
481 * TODO: Needs version with extra argument to allow entity sequences
482 * (of the form '&'xxx;) to be treated as single characters
483 * and to delete any '<'...'>' sequences,
484 * ie to do a more sophisticated job of sanitising XML/HTML
485 * that has some simple markup (primarily entity codes needed for i18n).
486 * TODO: jUnit tests
487 *
488 * @param maxLen if positive the output is limited to at most
489 * this number of characters;
490 * if >= 3 truncated text is marked with a trailing ``...''
491 * in the last three positions
492 * @param allowEntities if true, treats HTML/XML entity sequences
493 * as if single characters; the entities are vaguely tested for
494 * correct syntax and may not be allowed if invalid
495 */
496 public static String sanitiseForXML(final String input,
497 final int maxLen,
498 final boolean allowEntities)
499 {
500 if(maxLen < 0) { throw new IllegalArgumentException(); }
501
502 if(input == null) { return(""); }
503
504 // First trim whitespace from either end.
505 String in = input.trim();
506
507 // Always remove potentially-unsafe tag meta-characters.
508 in = in.replace('<', '{').replace('>', '}');
509
510 // If non-null, the input string has had all entities replaced with
511 // just the leading '&' with the entity code
512 // (not including the trailing ';')
513 // at the matching index in this array,
514 // so all entities will appear as a single character.
515 String entities[] = null;
516
517 // If not allowing entities zap all '&' chars...
518 if(!allowEntities)
519 { in = in.replace('&', '+'); }
520 // If we are allowing entities, and there are any,
521 // convert them all to placeholders.
522 else if(in.indexOf('&') >= 0)
523 {
524 // in may get truncated but not longer while this array is in use.
525 entities = new String[in.length()];
526
527 // Break out the (valid) entities one at a time...
528 for(int nextAmp = -1; (nextAmp = in.indexOf('&', nextAmp + 1)) != -1; )
529 {
530 // Look for the next semicolon...
531 final int nextSc = in.indexOf(';', nextAmp + 1);
532 if(nextSc == -1)
533 {
534 // Whoops; if no more semicolons, then
535 // all ampersands from current are bogus and we can stop.
536 in = ((nextAmp >= 0) ? in.substring(0, nextAmp) : "") +
537 (in.substring(nextAmp).replace('&', '+'));
538 break;
539 }
540
541
542 // TODO: do more validity checking of a putative entity:
543 // check that it letters and digits or a hash and all digits.
544
545
546 // Now assume that the entity is valid,
547 // so break it out into the entities array
548 // leaving the '&' behind
549 // and putting the body (not the trailing ';')
550 // in the entities[] array at the '&' index.
551 entities[nextAmp] = in.substring(nextAmp+1, nextSc);
552 in = in.substring(0, nextAmp+1) +
553 in.substring(nextSc+1);
554 }
555 }
556
557 // Truncate if necessary while
558 // any entities that we care about are "broken out"...
559 if((maxLen >= 0) && (in.length() > maxLen))
560 {
561 if(maxLen >= 3) { in = in.substring(0, maxLen-3) + "..."; }
562 else { in = in.substring(0, maxLen); }
563 }
564
565 // If we broke out any entities,
566 // put any remaining ones back in-line.
567 if(entities != null)
568 {
569 final StringBuilder sb = new StringBuilder(in.length()); // At least this long.
570
571 final int len = in.length();
572 for(int i = 0; i < len; ++i)
573 {
574 final char c = in.charAt(i);
575 sb.append(c);
576 if(c != '&')
577 { continue; } // Usual case; no entity involved.
578
579 // Expand an entity.
580 assert(entities[i] != null);
581 sb.append(entities[i]).append(';');
582 }
583
584 // Convert back to expanded string.
585 in = sb.toString();
586
587 // entities = null; // Allow GC.
588 }
589
590 // Look for any char outwith range 32--126.
591 // If one is found then replace every occurrence with ' ' and retrim at end,
592 // unless allowing entities in which case replace with an entity code.
593 for(int i = in.length(); --i >= 0; )
594 {
595 final char c = in.charAt(i);
596 if((c < 32) || (c > 126))
597 {
598 // Whole string needs fixing...
599 final StringBuilder sb = new StringBuilder(2 * in.length());
600 for(int j = 0; j < in.length(); ++j)
601 {
602 final char d = in.charAt(j);
603 if((d < 32) || (d > 126))
604 {
605 // If allowing entities,
606 // convert non-ASCII chars to entities.
607 if(!allowEntities) { sb.append(' '); }
608 else { sb.append("&#").append(0xffff & d).append(';'); }
609 }
610 else { sb.append(d); }
611 }
612 in = sb.toString();
613 if(!allowEntities) { in = in.trim(); }
614 break;
615 }
616 }
617
618 if(input.equals(in)) { return(input); } // Avoid making copies!
619 return(in);
620 }
621
622
623 /**Extension for a CharSequence that holds only 8-bit character values.
624 * The implementation of this may often hold the underlying data as a byte array.
625 */
626 public static interface CharSequence8Bit extends CharSequence
627 {
628 /**Return private copy (8-bit) text with each byte corresponding to one char; never null.
629 * May be far more efficient than converting via a String
630 * if the aim is to write directly to a file for example.
631 */
632 public byte[] toByteArray();
633
634 /**Returns byte value at given index.
635 * Like charAt(), but avoids explicit conversion to char.
636 */
637 public byte byteAt(final int i);
638 }
639
640 /**Extension (marker interface) for a CharSequence that holds only 7-bit character values.
641 * The implementation of this may often hold the underlying data as a byte array.
642 */
643 public static interface CharSequence7Bit extends CharSequence8Bit { }
644
645 /**Orders CharSequence objects as if by String.compareTo(); not null.
646 * If both arguments to compare() are the same reference (even null) this returns 0 immediately.
647 */
648 public static final Comparator<CharSequence> CASE_SENSITIVE_ORDER = new Comparator<CharSequence>(){
649 public int compare(final CharSequence cs1, final CharSequence cs2)
650 {
651 if(cs1 == cs2) { return(0); } // Optimisation...
652 final int l1 = cs1.length();
653 final int l2 = cs2.length();
654 final int l = Math.min(l1, l2);
655 for(int i = 0; i < l; ++i)
656 {
657 final char c1 = cs1.charAt(i);
658 final char c2 = cs2.charAt(i);
659 if(c1 != c2)
660 { return(c1 - c2); }
661 }
662 return(l1 - l2);
663 }
664 };
665
666 /**Orders CharSequence objects as if by String.compareToIgnoreCase(); not null.
667 * If both arguments to compare() are the same reference (even null) this returns 0 immediately.
668 */
669 public static final Comparator<CharSequence> CASE_INSENSITIVE_ORDER = new Comparator<CharSequence>(){
670 public int compare(final CharSequence cs1, final CharSequence cs2)
671 {
672 if(cs1 == cs2) { return(0); } // Optimisation...
673 final int l1 = cs1.length();
674 final int l2 = cs2.length();
675 final int l = Math.min(l1, l2);
676 for(int i = 0; i < l; ++i)
677 {
678 char c1 = cs1.charAt(i);
679 char c2 = cs2.charAt(i);
680 if(c1 != c2)
681 {
682 c1 = Character.toUpperCase(c1);
683 c2 = Character.toUpperCase(c2);
684 if(c1 != c2)
685 {
686 c1 = Character.toLowerCase(c1);
687 c2 = Character.toLowerCase(c2);
688 if (c1 != c2)
689 { return(c1 - c2); }
690 }
691 }
692 }
693 return(l1 - l2);
694 }
695 };
696
697 /**First index of specified character in given (non-null) CharSequence as for String.
698 * @return index of first occurrence of c, else -1 if no occurrence of c
699 */
700 public static final int indexOf(final CharSequence cs, final char c)
701 { return(TextUtils.indexOf(cs, c, 0)); }
702
703 /**First index of specified character in given (non-null) CharSequence from/after specified index as for String.
704 * @return index of first occurrence of c, else -1 if no occurrence of c
705 */
706 public static final int indexOf(final CharSequence cs, final char c, final int fromIndex)
707 {
708 if(cs == null) { throw new IllegalArgumentException(); }
709 final int len = cs.length();
710 for(int i = Math.max(0, fromIndex); i < len; ++i)
711 { if(c == cs.charAt(i)) { return(i); } }
712 return(-1);
713 }
714
715 /**Last index of specified character in given (non-null) CharSequence as for String.
716 * @return index of last occurrence of c, else -1 if no occurrence of c
717 */
718 public static final int lastIndexOf(final CharSequence cs, final char c)
719 { return(TextUtils.lastIndexOf(cs, c, Integer.MAX_VALUE)); }
720
721 /**Last index of specified character in given (non-null) CharSequence from/before specified start index as for String.
722 * @return index of last occurrence of c, else -1 if no occurrence of c
723 */
724 public static final int lastIndexOf(final CharSequence cs, final char c, final int fromIndex)
725 {
726 if(cs == null) { throw new IllegalArgumentException(); }
727 final int len = cs.length();
728 for(int i = ((fromIndex >= len) ? len - 1 : fromIndex); --i >= 0; )
729 { if(c == cs.charAt(i)) { return(i); } }
730 return(-1);
731 }
732
733 /**Checks that the specified region of two (non-null) CharSequences matches exactly (as for String).
734 */
735 public static boolean regionMatches(final CharSequence cs1, final int start1,
736 final CharSequence cs2, final int start2,
737 final int len)
738 {
739 if((null == cs1) || (null == cs2)) { throw new IllegalArgumentException(); }
740 if((start1 < 0) || (start2 < 0)) { throw new IllegalArgumentException(); }
741 if((start1 + len > cs1.length()) || (start2 + len > cs2.length())) { throw new IllegalArgumentException(); }
742 for(int i = len; --i >= 0; )
743 { if(cs1.charAt(start1 + i) != cs2.charAt(start2 + i)) { return(false); } }
744 return(true);
745 }
746
747 /**Checks that two CharSequences are both null or represent the same (non-null) sequence of chars. */
748 public static boolean contentEqualsOrBothNull(final CharSequence cs1, final CharSequence cs2)
749 {
750 if(cs1 == cs2) { return(true); } // Optimisation; both null or same non-null item.
751 if((null == cs1) != (null == cs2)) { return(false); } // Only one is null.
752 final int len = cs1.length();
753 if(len != cs2.length()) { return(false); }
754 for(int i = len; --i >= 0; )
755 { if(cs1.charAt(i) != cs2.charAt(i)) { return(false); } }
756 return(true);
757 }
758
759 /**Checks that two (non-null) 8-bit CharSequences represent the same sequence of chars/bytes. */
760 public static boolean contentEquals(final CharSequence8Bit cs1, final CharSequence8Bit cs2)
761 {
762 if((null == cs1) || (null == cs2)) { throw new IllegalArgumentException(); }
763 if(cs1 == cs2) { return(true); } // Optimisation.
764 final int len = cs1.length();
765 if(len != cs2.length()) { return(false); }
766 for(int i = len; --i >= 0; )
767 { if(cs1.byteAt(i) != cs2.byteAt(i)) { return(false); } }
768 return(true);
769 }
770
771 /**Checks that two (non-null) CharSequences represent the same sequence of chars. */
772 public static boolean contentEquals(final CharSequence cs1, final CharSequence cs2)
773 {
774 if((null == cs1) || (null == cs2)) { throw new IllegalArgumentException(); }
775 if(cs1 == cs2) { return(true); } // Optimisation.
776 final int len = cs1.length();
777 if(len != cs2.length()) { return(false); }
778 for(int i = len; --i >= 0; )
779 { if(cs1.charAt(i) != cs2.charAt(i)) { return(false); } }
780 return(true);
781 }
782
783 /**Checks that two (non-null) CharSequences represent the same sequence of chars, ignoring case. */
784 public static boolean contentEqualsIgnoreCase(final CharSequence cs1, final CharSequence cs2)
785 {
786 if((null == cs1) || (null == cs2)) { throw new IllegalArgumentException(); }
787 if(cs1 == cs2) { return(true); } // Optimisation.
788 final int len = cs1.length();
789 if(len != cs2.length()) { return(false); }
790 // Use our case-insensitive comparator for consistency.
791 return(0 == CASE_INSENSITIVE_ORDER.compare(cs1, cs2));
792 }
793
794 /**Compares two (non-null) CharSequences for lexical order. */
795 public static int compare(final CharSequence cs1, final CharSequence cs2)
796 {
797 // Arg check for non-null omitted as performance-critical as of 2012/02/26, and not all JITs smart enough to elide.
798 //if((null == cs1) || (null == cs2)) { throw new IllegalArgumentException(); }
799 final int l1 = cs1.length();
800 final int l2 = cs2.length();
801 final int l = Math.min(l1, l2);
802 for(int i = 0; i < l; ++i)
803 {
804 final char c1 = cs1.charAt(i);
805 final char c2 = cs2.charAt(i);
806 if(c1 != c2) { return(c1 - c2); }
807 }
808 return(l1 - l2);
809 }
810
811 /**Returns true if the first sequence starts with the second (neither null), else false. */
812 public static boolean startsWith(final CharSequence mainText,
813 final CharSequence putativePrefix)
814 {
815 final int ppl = putativePrefix.length();
816 return((mainText.length() >= ppl) &&
817 regionMatches(mainText, 0, putativePrefix, 0, ppl));
818 }
819
820 /**Returns true if the first sequence ends with the second (neither null), else false. */
821 public static boolean endsWith(final CharSequence mainText,
822 final CharSequence putativeSuffix)
823 {
824 final int psl = putativeSuffix.length();
825 final int ml = mainText.length();
826 return((ml >= psl) &&
827 regionMatches(mainText, ml-psl, putativeSuffix, 0, psl));
828 }
829
830 /**Create and populate a SortedSet of CharSequence in natural/total case-sensitive order that will work with any mix of immutable CharSequence keys; never null.
831 * This may not be consistent with equals() if keys are of mixed types.
832 * <p>
833 * The result is not thread-safe and is mutable.
834 *
835 * @param csc initial collection to populate from; can be null to avoid initial population
836 */
837 public static <T extends CharSequence> SortedSet<T> createCharSequenceSortedSet(final Collection<T> csc)
838 { return(createCharSequenceSortedSet(csc, TextUtils.CASE_SENSITIVE_ORDER)); }
839
840 /**Create and populate a SortedSet of CharSequence in specified order that will work with any mix of immutable CharSequence keys; never null.
841 * This may not be consistent with equals() if keys are of mixed types.
842 * <p>
843 * The result is not thread-safe and is mutable.
844 *
845 * @param csc initial collection to populate from; can be null to avoid initial population
846 * @param comparator used to order the items; never null
847 */
848 public static <T extends CharSequence> SortedSet<T> createCharSequenceSortedSet(final Collection<T> csc, final Comparator<CharSequence> comparator)
849 {
850 if(null == comparator) { throw new IllegalArgumentException(); }
851 final SortedSet<T> result = new TreeSet<T>(comparator);
852 if(null != csc) { result.addAll(csc); }
853 return(result);
854 }
855
856 /**Minimum-character entity to hold as Name with its high overheads; non-negative.
857 * If smaller than this then CS8Bit may be used instead,
858 * with internal de-duping of values and keys
859 * rather than the global intern()-driven system used by Name.
860 * <p>
861 * Empirically determined by distribution of sizes of keys and values!
862 */
863 private static final int MIN_NAME_CHARS_FOR_EFFICIENT_TEXT_REPRESENTATION = 16;
864
865 /**Make an efficient representation for possibly non-unique text to be held in memory long-term.
866 * Does not make reference to other existing values that may be held together.
867 * <p>
868 * A null input results in a null.
869 * <p>
870 * A non-8-bit (or empty) text results in a String result.
871 * <p>
872 * A short (8-bit) text will result in an non-intern()ed CS8Bit result.
873 * <p>
874 * Else an (implicitly intern()ed) Name is returned.
875 * <p>
876 * This aims to avoid intern() overhead for small values.
877 *
878 * @param value the value to optimise
879 * @param prevRef if non-null, used to convey Name values from one call to the next
880 * to help provide a more-compact representation by sharing prefixes and suffixes
881 */
882 public static CharSequence makeEfficientTextRepresentation(final CharSequence value,
883 final AtomicReference<Name> prevRef)
884 {
885 if(null == value) { return(null); }
886 final int length = value.length();
887 if(0 == length) { return(""); }
888
889 CharSequence vcs;
890 try
891 {
892 // If the value is short then use CS8Bit and don't intern() it because of overhead.
893 // If the supplied value has the correct type then use as-is,
894 // else try to convert it to the target type,
895 // falling back to String if not possible (eg non-8-bit characters present).
896 if(length < MIN_NAME_CHARS_FOR_EFFICIENT_TEXT_REPRESENTATION)
897 {
898 final CS8Bit c = (value.getClass() == CS8Bit.class) ? (CS8Bit) value : new CS8Bit(value);
899 vcs = c;
900 }
901 else
902 {
903 final Name n = (value.getClass() == Name.class) ? (Name) value :
904 Name.create(value, (null != prevRef) ? prevRef.get() : null);
905 vcs = n;
906 prevRef.set(n);
907 }
908 }
909 // If conversion of value to Name fails then convert to String and intern().
910 // We should only need a String form for non-8-bit values.
911 catch(final IllegalArgumentException e)
912 { vcs = MemoryTools.intern(value.toString()); }
913 return vcs;
914 }
915
916 /**Efficient conversion from 7-bit or 8-bit text (once char per byte) to String; never null.
917 * The array passed is not altered in any way.
918 * @param ascii text; never null but may be empty
919 * @return new String instance; never null
920 */
921 public static String toString(final byte[] ascii)
922 { return(new String(ascii, 0)); } // The deprecated method is by far the most straightforward.
923
924 /**Return a hash code the same as or similar to that of a String containing the same characters. */
925 public static int hashCode(final CharSequence cs)
926 {
927 final int len = cs.length();
928 int h = 0;
929 for(int i = 0; i < len; i++)
930 { h = 31 * h + cs.charAt(i); }
931 return(h);
932 }
933
934 /**Return an ASCII printable hex hash code; never null nor empty. */
935 public static String hashCodeHexString(final CharSequence cs)
936 { return(Integer.toHexString(hashCode(cs)));}
937
938 /**Compiled regex to match inter-word gaps in plain (mainly English) text; non-null. */
939 private static final Pattern wordBoundaryRegex = Pattern.compile("\\s+");
940
941 /**Quickly attempt to count the words in plain text; non-negative. */
942 public static int quickWordCount(final CharSequence text)
943 {
944 int matchCount = 0;
945
946 // Count segments before each match.
947 final Matcher m = wordBoundaryRegex.matcher(text);
948 int index = 0;
949 while(m.find())
950 {
951 final String match = text.subSequence(index, m.start()).toString();
952 if(0 != match.length()) { ++matchCount; }
953 index = m.end();
954 }
955
956 final int length = text.length();
957
958 // If no match was found, return 0 if string empty else 1 (assume text contains one word).
959 if (index == 0) { return((length == 0) ? 0 : 1); }
960
961 // Add any non-empty trailing segment.
962 if(index < length) { ++matchCount; }
963
964 return(matchCount);
965 }
966 }