Skip to content

Commit d49f9af

Browse files
committed
Added SIMD Support for Base64 algorithm
1 parent e9708cf commit d49f9af

File tree

7 files changed

+1444
-18
lines changed

7 files changed

+1444
-18
lines changed

CodenameOne/src/com/codename1/util/Base64.java

Lines changed: 295 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ public abstract class Base64 {
3939

4040
private static final byte[] decodeMap = new byte[256];
4141
private static final int[] decodeMapInt = new int[256];
42+
private static final int SIMD_LANES = 16;
43+
private static final int SIMD_SCRATCH_INTS = 192;
4244

4345
static {
4446
for (int i = 0; i < decodeMap.length; i++) {
@@ -79,15 +81,15 @@ public static byte[] decode(byte[] in, int len) {
7981
return new byte[0];
8082
}
8183
int maxOutputLength = (len / 4) * 3 + 3;
82-
byte[] out = new byte[maxOutputLength];
84+
byte[] out = allocByteMaybeSimd(maxOutputLength);
8385
int outputLength = decode(in, len, out);
8486
if (outputLength < 0) {
8587
return null;
8688
}
8789
if (outputLength == out.length) {
8890
return out;
8991
}
90-
byte[] trimmed = new byte[outputLength];
92+
byte[] trimmed = allocByteMaybeSimd(outputLength);
9193
System.arraycopy(out, 0, trimmed, 0, outputLength);
9294
return trimmed;
9395
}
@@ -224,8 +226,9 @@ private static int decodeNoWhitespace(byte[] in, int len, byte[] out) {
224226
int outIndex = 0;
225227
int fullLen = len - (pad > 0 ? 4 : 0);
226228
int[] decodeMapLocal = decodeMapInt;
229+
int simdFullLen = 0;
227230

228-
for (int i = 0; i < fullLen; i += 4) {
231+
for (int i = simdFullLen; i < fullLen; i += 4) {
229232
int c0 = in[i] & 0xff;
230233
int c1 = in[i + 1] & 0xff;
231234
int c2 = in[i + 2] & 0xff;
@@ -337,7 +340,7 @@ public static String encodeNoNewline(byte[] in) {
337340
return "";
338341
}
339342
int outputLength = ((inputLength + 2) / 3) * 4;
340-
byte[] out = new byte[outputLength];
343+
byte[] out = allocByteMaybeSimd(outputLength);
341344
encodeNoNewline(in, out);
342345
return com.codename1.util.StringUtil.newString(out, 0, outputLength);
343346
}
@@ -433,4 +436,292 @@ public static int encodeNoNewline(byte[] in, byte[] out) {
433436
}
434437
return outIndex;
435438
}
439+
440+
/// SIMD-optimized Base64 encoding with explicit offsets and caller scratch.
441+
/// Scratch layout: a single SIMD-allocated `int[]` buffer of at least 192 ints.
442+
///
443+
/// Usage example:
444+
/// ```java
445+
/// Simd simd = Simd.get();
446+
/// byte[] input = simd.allocByte(data.length);
447+
/// System.arraycopy(data, 0, input, 0, data.length);
448+
/// byte[] output = simd.allocByte(((data.length + 2) / 3) * 4);
449+
/// int[] scratch = simd.allocInt(192);
450+
/// int written = Base64.encodeNoNewlineSimd(input, 0, input.length, output, 0, scratch);
451+
/// ```
452+
@DisableDebugInfo
453+
@DisableNullChecksAndArrayBoundsChecks
454+
public static int encodeNoNewlineSimd(byte[] in, int inOffset, int inLength, byte[] out, int outOffset, int[] scratch) {
455+
Simd simd = Simd.get();
456+
int outputLength = ((inLength + 2) / 3) * 4;
457+
if (out.length - outOffset < outputLength) {
458+
throw new IllegalArgumentException("Output buffer too small for encoded data");
459+
}
460+
if (inLength == 0) {
461+
return 0;
462+
}
463+
requireScratch(scratch);
464+
requireSimdApiArrays(simd, in, out, scratch);
465+
466+
final int b0 = 0;
467+
final int b1 = b0 + SIMD_LANES;
468+
final int b2 = b1 + SIMD_LANES;
469+
final int s0 = b2 + SIMD_LANES;
470+
final int s1 = s0 + SIMD_LANES;
471+
final int s2 = s1 + SIMD_LANES;
472+
final int s3 = s2 + SIMD_LANES;
473+
final int t0 = s3 + SIMD_LANES;
474+
final int t1 = t0 + SIMD_LANES;
475+
final int c3 = t1 + SIMD_LANES;
476+
final int c15 = c3 + SIMD_LANES;
477+
final int c63 = c15 + SIMD_LANES;
478+
479+
for (int lane = 0; lane < SIMD_LANES; lane++) {
480+
scratch[c3 + lane] = 3;
481+
scratch[c15 + lane] = 15;
482+
scratch[c63 + lane] = 63;
483+
}
484+
485+
int end = inOffset + inLength - (inLength % 3);
486+
int simdEnd = end - ((end - inOffset) % 48);
487+
int inIndex = inOffset;
488+
int outIndex = outOffset;
489+
for (; inIndex < simdEnd; inIndex += 48) {
490+
for (int lane = 0; lane < SIMD_LANES; lane++) {
491+
int src = inIndex + lane * 3;
492+
scratch[b0 + lane] = in[src] & 0xff;
493+
scratch[b1 + lane] = in[src + 1] & 0xff;
494+
scratch[b2 + lane] = in[src + 2] & 0xff;
495+
}
496+
497+
simd.shrLogical(scratch, b0, 2, scratch, s0, SIMD_LANES);
498+
simd.and(scratch, b0, scratch, c3, scratch, t0, SIMD_LANES);
499+
simd.shl(scratch, t0, 4, scratch, t0, SIMD_LANES);
500+
simd.shrLogical(scratch, b1, 4, scratch, t1, SIMD_LANES);
501+
simd.or(scratch, t0, scratch, t1, scratch, s1, SIMD_LANES);
502+
simd.and(scratch, b1, scratch, c15, scratch, t0, SIMD_LANES);
503+
simd.shl(scratch, t0, 2, scratch, t0, SIMD_LANES);
504+
simd.shrLogical(scratch, b2, 6, scratch, t1, SIMD_LANES);
505+
simd.or(scratch, t0, scratch, t1, scratch, s2, SIMD_LANES);
506+
simd.and(scratch, b2, scratch, c63, scratch, s3, SIMD_LANES);
507+
508+
for (int lane = 0; lane < SIMD_LANES; lane++) {
509+
out[outIndex++] = map[scratch[s0 + lane]];
510+
out[outIndex++] = map[scratch[s1 + lane]];
511+
out[outIndex++] = map[scratch[s2 + lane]];
512+
out[outIndex++] = map[scratch[s3 + lane]];
513+
}
514+
}
515+
516+
for (; inIndex < end; inIndex += 3) {
517+
int x0 = in[inIndex] & 0xff;
518+
int x1 = in[inIndex + 1] & 0xff;
519+
int x2 = in[inIndex + 2] & 0xff;
520+
out[outIndex++] = map[x0 >> 2];
521+
out[outIndex++] = map[((x0 & 0x03) << 4) | (x1 >> 4)];
522+
out[outIndex++] = map[((x1 & 0x0f) << 2) | (x2 >> 6)];
523+
out[outIndex++] = map[x2 & 0x3f];
524+
}
525+
526+
switch (inOffset + inLength - end) {
527+
case 1: {
528+
int x0 = in[end] & 0xff;
529+
out[outIndex++] = map[x0 >> 2];
530+
out[outIndex++] = map[(x0 & 0x03) << 4];
531+
out[outIndex++] = '=';
532+
out[outIndex++] = '=';
533+
break;
534+
}
535+
case 2: {
536+
int x0 = in[end] & 0xff;
537+
int x1 = in[end + 1] & 0xff;
538+
out[outIndex++] = map[x0 >> 2];
539+
out[outIndex++] = map[((x0 & 0x03) << 4) | (x1 >> 4)];
540+
out[outIndex++] = map[(x1 & 0x0f) << 2];
541+
out[outIndex++] = '=';
542+
break;
543+
}
544+
default:
545+
break;
546+
}
547+
return outputLength;
548+
}
549+
550+
/// SIMD-optimized Base64 decoding for no-whitespace input.
551+
/// Scratch layout: a single SIMD-allocated `int[]` buffer of at least 192 ints.
552+
///
553+
/// Returns decoded bytes written, or `-1` for invalid input.
554+
///
555+
/// Usage example:
556+
/// ```java
557+
/// Simd simd = Simd.get();
558+
/// byte[] encoded = simd.allocByte(base64Bytes.length);
559+
/// System.arraycopy(base64Bytes, 0, encoded, 0, base64Bytes.length);
560+
/// byte[] decoded = simd.allocByte((encoded.length / 4) * 3);
561+
/// int[] scratch = simd.allocInt(192);
562+
/// int written = Base64.decodeNoWhitespaceSimd(encoded, 0, encoded.length, decoded, 0, scratch);
563+
/// ```
564+
@DisableDebugInfo
565+
@DisableNullChecksAndArrayBoundsChecks
566+
public static int decodeNoWhitespaceSimd(byte[] in, int inOffset, int inLength, byte[] out, int outOffset, int[] scratch) {
567+
if ((inLength & 0x3) != 0) {
568+
return -1;
569+
}
570+
int pad = 0;
571+
if (inLength > 0 && in[inOffset + inLength - 1] == '=') {
572+
pad++;
573+
if (inLength > 1 && in[inOffset + inLength - 2] == '=') {
574+
pad++;
575+
}
576+
}
577+
if (pad > 2) {
578+
return -1;
579+
}
580+
int outLength = (inLength / 4) * 3 - pad;
581+
if (outLength <= 0) {
582+
return 0;
583+
}
584+
if (out.length - outOffset < outLength) {
585+
throw new IllegalArgumentException("Output buffer too small for decoded data");
586+
}
587+
588+
requireScratch(scratch);
589+
Simd simd = Simd.get();
590+
requireSimdApiArrays(simd, in, out, scratch);
591+
592+
final int q0 = 0;
593+
final int q1 = q0 + SIMD_LANES;
594+
final int q2 = q1 + SIMD_LANES;
595+
final int q3 = q2 + SIMD_LANES;
596+
final int o0 = q3 + SIMD_LANES;
597+
final int o1 = o0 + SIMD_LANES;
598+
final int o2 = o1 + SIMD_LANES;
599+
final int t0 = o2 + SIMD_LANES;
600+
final int t1 = t0 + SIMD_LANES;
601+
final int c3 = t1 + SIMD_LANES;
602+
final int c15 = c3 + SIMD_LANES;
603+
604+
for (int lane = 0; lane < SIMD_LANES; lane++) {
605+
scratch[c3 + lane] = 3;
606+
scratch[c15 + lane] = 15;
607+
}
608+
609+
int fullLen = inLength - (pad > 0 ? 4 : 0);
610+
int simdFullLen = fullLen - (fullLen % 64);
611+
int inIndex = inOffset;
612+
int outIndex = outOffset;
613+
int endVector = inOffset + simdFullLen;
614+
for (; inIndex < endVector; inIndex += 64) {
615+
for (int lane = 0; lane < SIMD_LANES; lane++) {
616+
int src = inIndex + lane * 4;
617+
int d0 = decodeMapInt[in[src] & 0xff];
618+
int d1 = decodeMapInt[in[src + 1] & 0xff];
619+
int d2 = decodeMapInt[in[src + 2] & 0xff];
620+
int d3 = decodeMapInt[in[src + 3] & 0xff];
621+
if ((d0 | d1 | d2 | d3) < 0) {
622+
return -1;
623+
}
624+
scratch[q0 + lane] = d0;
625+
scratch[q1 + lane] = d1;
626+
scratch[q2 + lane] = d2;
627+
scratch[q3 + lane] = d3;
628+
}
629+
630+
simd.shl(scratch, q0, 2, scratch, o0, SIMD_LANES);
631+
simd.shrLogical(scratch, q1, 4, scratch, t0, SIMD_LANES);
632+
simd.or(scratch, o0, scratch, t0, scratch, o0, SIMD_LANES);
633+
simd.and(scratch, q1, scratch, c15, scratch, t0, SIMD_LANES);
634+
simd.shl(scratch, t0, 4, scratch, t0, SIMD_LANES);
635+
simd.shrLogical(scratch, q2, 2, scratch, t1, SIMD_LANES);
636+
simd.or(scratch, t0, scratch, t1, scratch, o1, SIMD_LANES);
637+
simd.and(scratch, q2, scratch, c3, scratch, t0, SIMD_LANES);
638+
simd.shl(scratch, t0, 6, scratch, t0, SIMD_LANES);
639+
simd.or(scratch, t0, scratch, q3, scratch, o2, SIMD_LANES);
640+
641+
for (int lane = 0; lane < SIMD_LANES; lane++) {
642+
out[outIndex++] = (byte)scratch[o0 + lane];
643+
out[outIndex++] = (byte)scratch[o1 + lane];
644+
out[outIndex++] = (byte)scratch[o2 + lane];
645+
}
646+
}
647+
648+
int fullEnd = inOffset + fullLen;
649+
for (; inIndex < fullEnd; inIndex += 4) {
650+
int c0 = in[inIndex] & 0xff;
651+
int c1 = in[inIndex + 1] & 0xff;
652+
int c2 = in[inIndex + 2] & 0xff;
653+
int c3v = in[inIndex + 3] & 0xff;
654+
int x0 = decodeMapInt[c0];
655+
int x1 = decodeMapInt[c1];
656+
int x2 = decodeMapInt[c2];
657+
int x3 = decodeMapInt[c3v];
658+
if ((x0 | x1 | x2 | x3) < 0) {
659+
return -1;
660+
}
661+
int quantum = (x0 << 18) | (x1 << 12) | (x2 << 6) | x3;
662+
out[outIndex++] = (byte)((quantum >> 16) & 0xff);
663+
out[outIndex++] = (byte)((quantum >> 8) & 0xff);
664+
out[outIndex++] = (byte)(quantum & 0xff);
665+
}
666+
667+
if (pad == 0) {
668+
return outLength;
669+
}
670+
671+
int i = inOffset + inLength - 4;
672+
int c0 = in[i] & 0xff;
673+
int c1 = in[i + 1] & 0xff;
674+
int x0 = decodeMapInt[c0];
675+
int x1 = decodeMapInt[c1];
676+
if ((x0 | x1) < 0) {
677+
return -1;
678+
}
679+
out[outIndex++] = (byte)((x0 << 2) | (x1 >> 4));
680+
if (pad == 2) {
681+
return (in[i + 2] == '=' && in[i + 3] == '=') ? outLength : -1;
682+
}
683+
if (in[i + 3] != '=') {
684+
return -1;
685+
}
686+
int x2 = decodeMapInt[in[i + 2] & 0xff];
687+
if (x2 < 0) {
688+
return -1;
689+
}
690+
out[outIndex] = (byte)((x1 << 4) | (x2 >> 2));
691+
return outLength;
692+
}
693+
694+
/// Convenience overload for `encodeNoNewlineSimd(byte[], int, int, byte[], int, int[])`
695+
/// using zero offsets.
696+
public static int encodeNoNewlineSimd(byte[] in, byte[] out, int[] scratch) {
697+
return encodeNoNewlineSimd(in, 0, in.length, out, 0, scratch);
698+
}
699+
700+
/// Convenience overload for `decodeNoWhitespaceSimd(byte[], int, int, byte[], int, int[])`
701+
/// using zero offsets.
702+
public static int decodeNoWhitespaceSimd(byte[] in, int len, byte[] out, int[] scratch) {
703+
return decodeNoWhitespaceSimd(in, 0, len, out, 0, scratch);
704+
}
705+
706+
private static void requireScratch(int[] scratch) {
707+
if (scratch == null || scratch.length < SIMD_SCRATCH_INTS) {
708+
throw new IllegalArgumentException("scratch must be an int[] allocated with Simd.allocInt(192) or larger");
709+
}
710+
}
711+
712+
private static void requireSimdApiArrays(Simd simd, byte[] in, byte[] out, int[] scratch) {
713+
simd.unpackUnsignedByteToInt(in, scratch, 0, 0);
714+
simd.packIntToByteTruncate(scratch, out, 0, 0);
715+
}
716+
717+
private static byte[] allocByteMaybeSimd(int size) {
718+
if (size <= 0) {
719+
return new byte[0];
720+
}
721+
Simd simd = Simd.get();
722+
if (simd.isSupported() && size >= 16) {
723+
return simd.allocByte(size);
724+
}
725+
return new byte[size];
726+
}
436727
}

0 commit comments

Comments
 (0)