@@ -39,6 +39,8 @@ public abstract class Base64 {
3939
4040 private static final byte [] decodeMap = new byte [256 ];
4141 private static final int [] decodeMapInt = new int [256 ];
42+ private static final int SIMD_LANES = 16 ;
43+ private static final int SIMD_SCRATCH_INTS = 192 ;
4244
4345 static {
4446 for (int i = 0 ; i < decodeMap .length ; i ++) {
@@ -79,15 +81,15 @@ public static byte[] decode(byte[] in, int len) {
7981 return new byte [0 ];
8082 }
8183 int maxOutputLength = (len / 4 ) * 3 + 3 ;
82- byte [] out = new byte [ maxOutputLength ] ;
84+ byte [] out = allocByteMaybeSimd ( maxOutputLength ) ;
8385 int outputLength = decode (in , len , out );
8486 if (outputLength < 0 ) {
8587 return null ;
8688 }
8789 if (outputLength == out .length ) {
8890 return out ;
8991 }
90- byte [] trimmed = new byte [ outputLength ] ;
92+ byte [] trimmed = allocByteMaybeSimd ( outputLength ) ;
9193 System .arraycopy (out , 0 , trimmed , 0 , outputLength );
9294 return trimmed ;
9395 }
@@ -224,8 +226,9 @@ private static int decodeNoWhitespace(byte[] in, int len, byte[] out) {
224226 int outIndex = 0 ;
225227 int fullLen = len - (pad > 0 ? 4 : 0 );
226228 int [] decodeMapLocal = decodeMapInt ;
229+ int simdFullLen = 0 ;
227230
228- for (int i = 0 ; i < fullLen ; i += 4 ) {
231+ for (int i = simdFullLen ; i < fullLen ; i += 4 ) {
229232 int c0 = in [i ] & 0xff ;
230233 int c1 = in [i + 1 ] & 0xff ;
231234 int c2 = in [i + 2 ] & 0xff ;
@@ -337,7 +340,7 @@ public static String encodeNoNewline(byte[] in) {
337340 return "" ;
338341 }
339342 int outputLength = ((inputLength + 2 ) / 3 ) * 4 ;
340- byte [] out = new byte [ outputLength ] ;
343+ byte [] out = allocByteMaybeSimd ( outputLength ) ;
341344 encodeNoNewline (in , out );
342345 return com .codename1 .util .StringUtil .newString (out , 0 , outputLength );
343346 }
@@ -433,4 +436,292 @@ public static int encodeNoNewline(byte[] in, byte[] out) {
433436 }
434437 return outIndex ;
435438 }
439+
440+ /// SIMD-optimized Base64 encoding with explicit offsets and caller scratch.
441+ /// Scratch layout: a single SIMD-allocated `int[]` buffer of at least 192 ints.
442+ ///
443+ /// Usage example:
444+ /// ```java
445+ /// Simd simd = Simd.get();
446+ /// byte[] input = simd.allocByte(data.length);
447+ /// System.arraycopy(data, 0, input, 0, data.length);
448+ /// byte[] output = simd.allocByte(((data.length + 2) / 3) * 4);
449+ /// int[] scratch = simd.allocInt(192);
450+ /// int written = Base64.encodeNoNewlineSimd(input, 0, input.length, output, 0, scratch);
451+ /// ```
452+ @ DisableDebugInfo
453+ @ DisableNullChecksAndArrayBoundsChecks
454+ public static int encodeNoNewlineSimd (byte [] in , int inOffset , int inLength , byte [] out , int outOffset , int [] scratch ) {
455+ Simd simd = Simd .get ();
456+ int outputLength = ((inLength + 2 ) / 3 ) * 4 ;
457+ if (out .length - outOffset < outputLength ) {
458+ throw new IllegalArgumentException ("Output buffer too small for encoded data" );
459+ }
460+ if (inLength == 0 ) {
461+ return 0 ;
462+ }
463+ requireScratch (scratch );
464+ requireSimdApiArrays (simd , in , out , scratch );
465+
466+ final int b0 = 0 ;
467+ final int b1 = b0 + SIMD_LANES ;
468+ final int b2 = b1 + SIMD_LANES ;
469+ final int s0 = b2 + SIMD_LANES ;
470+ final int s1 = s0 + SIMD_LANES ;
471+ final int s2 = s1 + SIMD_LANES ;
472+ final int s3 = s2 + SIMD_LANES ;
473+ final int t0 = s3 + SIMD_LANES ;
474+ final int t1 = t0 + SIMD_LANES ;
475+ final int c3 = t1 + SIMD_LANES ;
476+ final int c15 = c3 + SIMD_LANES ;
477+ final int c63 = c15 + SIMD_LANES ;
478+
479+ for (int lane = 0 ; lane < SIMD_LANES ; lane ++) {
480+ scratch [c3 + lane ] = 3 ;
481+ scratch [c15 + lane ] = 15 ;
482+ scratch [c63 + lane ] = 63 ;
483+ }
484+
485+ int end = inOffset + inLength - (inLength % 3 );
486+ int simdEnd = end - ((end - inOffset ) % 48 );
487+ int inIndex = inOffset ;
488+ int outIndex = outOffset ;
489+ for (; inIndex < simdEnd ; inIndex += 48 ) {
490+ for (int lane = 0 ; lane < SIMD_LANES ; lane ++) {
491+ int src = inIndex + lane * 3 ;
492+ scratch [b0 + lane ] = in [src ] & 0xff ;
493+ scratch [b1 + lane ] = in [src + 1 ] & 0xff ;
494+ scratch [b2 + lane ] = in [src + 2 ] & 0xff ;
495+ }
496+
497+ simd .shrLogical (scratch , b0 , 2 , scratch , s0 , SIMD_LANES );
498+ simd .and (scratch , b0 , scratch , c3 , scratch , t0 , SIMD_LANES );
499+ simd .shl (scratch , t0 , 4 , scratch , t0 , SIMD_LANES );
500+ simd .shrLogical (scratch , b1 , 4 , scratch , t1 , SIMD_LANES );
501+ simd .or (scratch , t0 , scratch , t1 , scratch , s1 , SIMD_LANES );
502+ simd .and (scratch , b1 , scratch , c15 , scratch , t0 , SIMD_LANES );
503+ simd .shl (scratch , t0 , 2 , scratch , t0 , SIMD_LANES );
504+ simd .shrLogical (scratch , b2 , 6 , scratch , t1 , SIMD_LANES );
505+ simd .or (scratch , t0 , scratch , t1 , scratch , s2 , SIMD_LANES );
506+ simd .and (scratch , b2 , scratch , c63 , scratch , s3 , SIMD_LANES );
507+
508+ for (int lane = 0 ; lane < SIMD_LANES ; lane ++) {
509+ out [outIndex ++] = map [scratch [s0 + lane ]];
510+ out [outIndex ++] = map [scratch [s1 + lane ]];
511+ out [outIndex ++] = map [scratch [s2 + lane ]];
512+ out [outIndex ++] = map [scratch [s3 + lane ]];
513+ }
514+ }
515+
516+ for (; inIndex < end ; inIndex += 3 ) {
517+ int x0 = in [inIndex ] & 0xff ;
518+ int x1 = in [inIndex + 1 ] & 0xff ;
519+ int x2 = in [inIndex + 2 ] & 0xff ;
520+ out [outIndex ++] = map [x0 >> 2 ];
521+ out [outIndex ++] = map [((x0 & 0x03 ) << 4 ) | (x1 >> 4 )];
522+ out [outIndex ++] = map [((x1 & 0x0f ) << 2 ) | (x2 >> 6 )];
523+ out [outIndex ++] = map [x2 & 0x3f ];
524+ }
525+
526+ switch (inOffset + inLength - end ) {
527+ case 1 : {
528+ int x0 = in [end ] & 0xff ;
529+ out [outIndex ++] = map [x0 >> 2 ];
530+ out [outIndex ++] = map [(x0 & 0x03 ) << 4 ];
531+ out [outIndex ++] = '=' ;
532+ out [outIndex ++] = '=' ;
533+ break ;
534+ }
535+ case 2 : {
536+ int x0 = in [end ] & 0xff ;
537+ int x1 = in [end + 1 ] & 0xff ;
538+ out [outIndex ++] = map [x0 >> 2 ];
539+ out [outIndex ++] = map [((x0 & 0x03 ) << 4 ) | (x1 >> 4 )];
540+ out [outIndex ++] = map [(x1 & 0x0f ) << 2 ];
541+ out [outIndex ++] = '=' ;
542+ break ;
543+ }
544+ default :
545+ break ;
546+ }
547+ return outputLength ;
548+ }
549+
550+ /// SIMD-optimized Base64 decoding for no-whitespace input.
551+ /// Scratch layout: a single SIMD-allocated `int[]` buffer of at least 192 ints.
552+ ///
553+ /// Returns decoded bytes written, or `-1` for invalid input.
554+ ///
555+ /// Usage example:
556+ /// ```java
557+ /// Simd simd = Simd.get();
558+ /// byte[] encoded = simd.allocByte(base64Bytes.length);
559+ /// System.arraycopy(base64Bytes, 0, encoded, 0, base64Bytes.length);
560+ /// byte[] decoded = simd.allocByte((encoded.length / 4) * 3);
561+ /// int[] scratch = simd.allocInt(192);
562+ /// int written = Base64.decodeNoWhitespaceSimd(encoded, 0, encoded.length, decoded, 0, scratch);
563+ /// ```
564+ @ DisableDebugInfo
565+ @ DisableNullChecksAndArrayBoundsChecks
566+ public static int decodeNoWhitespaceSimd (byte [] in , int inOffset , int inLength , byte [] out , int outOffset , int [] scratch ) {
567+ if ((inLength & 0x3 ) != 0 ) {
568+ return -1 ;
569+ }
570+ int pad = 0 ;
571+ if (inLength > 0 && in [inOffset + inLength - 1 ] == '=' ) {
572+ pad ++;
573+ if (inLength > 1 && in [inOffset + inLength - 2 ] == '=' ) {
574+ pad ++;
575+ }
576+ }
577+ if (pad > 2 ) {
578+ return -1 ;
579+ }
580+ int outLength = (inLength / 4 ) * 3 - pad ;
581+ if (outLength <= 0 ) {
582+ return 0 ;
583+ }
584+ if (out .length - outOffset < outLength ) {
585+ throw new IllegalArgumentException ("Output buffer too small for decoded data" );
586+ }
587+
588+ requireScratch (scratch );
589+ Simd simd = Simd .get ();
590+ requireSimdApiArrays (simd , in , out , scratch );
591+
592+ final int q0 = 0 ;
593+ final int q1 = q0 + SIMD_LANES ;
594+ final int q2 = q1 + SIMD_LANES ;
595+ final int q3 = q2 + SIMD_LANES ;
596+ final int o0 = q3 + SIMD_LANES ;
597+ final int o1 = o0 + SIMD_LANES ;
598+ final int o2 = o1 + SIMD_LANES ;
599+ final int t0 = o2 + SIMD_LANES ;
600+ final int t1 = t0 + SIMD_LANES ;
601+ final int c3 = t1 + SIMD_LANES ;
602+ final int c15 = c3 + SIMD_LANES ;
603+
604+ for (int lane = 0 ; lane < SIMD_LANES ; lane ++) {
605+ scratch [c3 + lane ] = 3 ;
606+ scratch [c15 + lane ] = 15 ;
607+ }
608+
609+ int fullLen = inLength - (pad > 0 ? 4 : 0 );
610+ int simdFullLen = fullLen - (fullLen % 64 );
611+ int inIndex = inOffset ;
612+ int outIndex = outOffset ;
613+ int endVector = inOffset + simdFullLen ;
614+ for (; inIndex < endVector ; inIndex += 64 ) {
615+ for (int lane = 0 ; lane < SIMD_LANES ; lane ++) {
616+ int src = inIndex + lane * 4 ;
617+ int d0 = decodeMapInt [in [src ] & 0xff ];
618+ int d1 = decodeMapInt [in [src + 1 ] & 0xff ];
619+ int d2 = decodeMapInt [in [src + 2 ] & 0xff ];
620+ int d3 = decodeMapInt [in [src + 3 ] & 0xff ];
621+ if ((d0 | d1 | d2 | d3 ) < 0 ) {
622+ return -1 ;
623+ }
624+ scratch [q0 + lane ] = d0 ;
625+ scratch [q1 + lane ] = d1 ;
626+ scratch [q2 + lane ] = d2 ;
627+ scratch [q3 + lane ] = d3 ;
628+ }
629+
630+ simd .shl (scratch , q0 , 2 , scratch , o0 , SIMD_LANES );
631+ simd .shrLogical (scratch , q1 , 4 , scratch , t0 , SIMD_LANES );
632+ simd .or (scratch , o0 , scratch , t0 , scratch , o0 , SIMD_LANES );
633+ simd .and (scratch , q1 , scratch , c15 , scratch , t0 , SIMD_LANES );
634+ simd .shl (scratch , t0 , 4 , scratch , t0 , SIMD_LANES );
635+ simd .shrLogical (scratch , q2 , 2 , scratch , t1 , SIMD_LANES );
636+ simd .or (scratch , t0 , scratch , t1 , scratch , o1 , SIMD_LANES );
637+ simd .and (scratch , q2 , scratch , c3 , scratch , t0 , SIMD_LANES );
638+ simd .shl (scratch , t0 , 6 , scratch , t0 , SIMD_LANES );
639+ simd .or (scratch , t0 , scratch , q3 , scratch , o2 , SIMD_LANES );
640+
641+ for (int lane = 0 ; lane < SIMD_LANES ; lane ++) {
642+ out [outIndex ++] = (byte )scratch [o0 + lane ];
643+ out [outIndex ++] = (byte )scratch [o1 + lane ];
644+ out [outIndex ++] = (byte )scratch [o2 + lane ];
645+ }
646+ }
647+
648+ int fullEnd = inOffset + fullLen ;
649+ for (; inIndex < fullEnd ; inIndex += 4 ) {
650+ int c0 = in [inIndex ] & 0xff ;
651+ int c1 = in [inIndex + 1 ] & 0xff ;
652+ int c2 = in [inIndex + 2 ] & 0xff ;
653+ int c3v = in [inIndex + 3 ] & 0xff ;
654+ int x0 = decodeMapInt [c0 ];
655+ int x1 = decodeMapInt [c1 ];
656+ int x2 = decodeMapInt [c2 ];
657+ int x3 = decodeMapInt [c3v ];
658+ if ((x0 | x1 | x2 | x3 ) < 0 ) {
659+ return -1 ;
660+ }
661+ int quantum = (x0 << 18 ) | (x1 << 12 ) | (x2 << 6 ) | x3 ;
662+ out [outIndex ++] = (byte )((quantum >> 16 ) & 0xff );
663+ out [outIndex ++] = (byte )((quantum >> 8 ) & 0xff );
664+ out [outIndex ++] = (byte )(quantum & 0xff );
665+ }
666+
667+ if (pad == 0 ) {
668+ return outLength ;
669+ }
670+
671+ int i = inOffset + inLength - 4 ;
672+ int c0 = in [i ] & 0xff ;
673+ int c1 = in [i + 1 ] & 0xff ;
674+ int x0 = decodeMapInt [c0 ];
675+ int x1 = decodeMapInt [c1 ];
676+ if ((x0 | x1 ) < 0 ) {
677+ return -1 ;
678+ }
679+ out [outIndex ++] = (byte )((x0 << 2 ) | (x1 >> 4 ));
680+ if (pad == 2 ) {
681+ return (in [i + 2 ] == '=' && in [i + 3 ] == '=' ) ? outLength : -1 ;
682+ }
683+ if (in [i + 3 ] != '=' ) {
684+ return -1 ;
685+ }
686+ int x2 = decodeMapInt [in [i + 2 ] & 0xff ];
687+ if (x2 < 0 ) {
688+ return -1 ;
689+ }
690+ out [outIndex ] = (byte )((x1 << 4 ) | (x2 >> 2 ));
691+ return outLength ;
692+ }
693+
694+ /// Convenience overload for `encodeNoNewlineSimd(byte[], int, int, byte[], int, int[])`
695+ /// using zero offsets.
696+ public static int encodeNoNewlineSimd (byte [] in , byte [] out , int [] scratch ) {
697+ return encodeNoNewlineSimd (in , 0 , in .length , out , 0 , scratch );
698+ }
699+
700+ /// Convenience overload for `decodeNoWhitespaceSimd(byte[], int, int, byte[], int, int[])`
701+ /// using zero offsets.
702+ public static int decodeNoWhitespaceSimd (byte [] in , int len , byte [] out , int [] scratch ) {
703+ return decodeNoWhitespaceSimd (in , 0 , len , out , 0 , scratch );
704+ }
705+
706+ private static void requireScratch (int [] scratch ) {
707+ if (scratch == null || scratch .length < SIMD_SCRATCH_INTS ) {
708+ throw new IllegalArgumentException ("scratch must be an int[] allocated with Simd.allocInt(192) or larger" );
709+ }
710+ }
711+
712+ private static void requireSimdApiArrays (Simd simd , byte [] in , byte [] out , int [] scratch ) {
713+ simd .unpackUnsignedByteToInt (in , scratch , 0 , 0 );
714+ simd .packIntToByteTruncate (scratch , out , 0 , 0 );
715+ }
716+
717+ private static byte [] allocByteMaybeSimd (int size ) {
718+ if (size <= 0 ) {
719+ return new byte [0 ];
720+ }
721+ Simd simd = Simd .get ();
722+ if (simd .isSupported () && size >= 16 ) {
723+ return simd .allocByte (size );
724+ }
725+ return new byte [size ];
726+ }
436727}
0 commit comments