Skip to content

Commit 22eb74f

Browse files
authored
Add ParparVM annotations to disable debug and safety checks for hot Base64 methods (#4732)
* Add ParparVM method annotations to disable debug and safety checks * Optimize debug-disabled methods by skipping local debug metadata * Add fast method stack path and static init call fast-check * Fix static initializer fast-path guard to use class initialized flag * Add primitive fast-frame path and inline fast return release * Add missing exception constructor declarations for fast stack init * Revert Base64 annotation opt-in after no benchmark gain * Refine fast primitive frame init to avoid slow per-slot loop * Restore Base64 annotation-based translator optimizations * Optimize Base64 decode map lookups for CN1 runtime * Tighten Base64 decode locals to aid C optimizer * Harden Base64 fast decode writes with explicit bounds guards * Revert decode-local tweaks and enforce Release builds in perf tests * Document ParparVM performance hints and tradeoffs in developer guide
1 parent 64fe9df commit 22eb74f

13 files changed

Lines changed: 516 additions & 37 deletions

File tree

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
package com.codename1.annotations;
2+
3+
import java.lang.annotation.ElementType;
4+
import java.lang.annotation.Retention;
5+
import java.lang.annotation.RetentionPolicy;
6+
import java.lang.annotation.Target;
7+
8+
/**
9+
* Marks a method so that ParparVM does not emit debug line information for it.
10+
*/
11+
@Retention(RetentionPolicy.CLASS)
12+
@Target(ElementType.METHOD)
13+
public @interface DisableDebugInfo {
14+
}
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
package com.codename1.annotations;
2+
3+
import java.lang.annotation.ElementType;
4+
import java.lang.annotation.Retention;
5+
import java.lang.annotation.RetentionPolicy;
6+
import java.lang.annotation.Target;
7+
8+
/**
9+
* Marks a method so that ParparVM does not emit null checks or array bounds checks for it.
10+
*/
11+
@Retention(RetentionPolicy.CLASS)
12+
@Target(ElementType.METHOD)
13+
public @interface DisableNullChecksAndArrayBoundsChecks {
14+
}

CodenameOne/src/com/codename1/util/Base64.java

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,10 @@
1919

2020
package com.codename1.util;
2121

22+
import com.codename1.annotations.DisableDebugInfo;
23+
import com.codename1.annotations.DisableNullChecksAndArrayBoundsChecks;
24+
25+
2226
/// This class implements Base64 encoding/decoding functionality
2327
/// as specified in RFC 2045 (http://www.ietf.org/rfc/rfc2045.txt).
2428
public abstract class Base64 {
@@ -34,18 +38,25 @@ public abstract class Base64 {
3438
'4', '5', '6', '7', '8', '9', '+', '/'};
3539

3640
private static final byte[] decodeMap = new byte[256];
41+
private static final int[] decodeMapInt = new int[256];
3742

3843
static {
3944
for (int i = 0; i < decodeMap.length; i++) {
4045
decodeMap[i] = (byte) DECODE_INVALID;
46+
decodeMapInt[i] = DECODE_INVALID;
4147
}
4248
for (int i = 0; i < map.length; i++) {
4349
decodeMap[map[i] & 0xff] = (byte) i;
50+
decodeMapInt[map[i] & 0xff] = i;
4451
}
4552
decodeMap['\n'] = (byte) DECODE_WHITESPACE;
4653
decodeMap['\r'] = (byte) DECODE_WHITESPACE;
4754
decodeMap[' '] = (byte) DECODE_WHITESPACE;
4855
decodeMap['\t'] = (byte) DECODE_WHITESPACE;
56+
decodeMapInt['\n'] = DECODE_WHITESPACE;
57+
decodeMapInt['\r'] = DECODE_WHITESPACE;
58+
decodeMapInt[' '] = DECODE_WHITESPACE;
59+
decodeMapInt['\t'] = DECODE_WHITESPACE;
4960
}
5061

5162
public static byte[] decode(byte[] in) {
@@ -89,6 +100,8 @@ public static byte[] decode(byte[] in, int len) {
89100
* @param out destination buffer
90101
* @return decoded length, or {@code -1} for invalid Base64
91102
*/
103+
@DisableDebugInfo
104+
@DisableNullChecksAndArrayBoundsChecks
92105
public static int decode(byte[] in, int len, byte[] out) {
93106
if (len == 0) {
94107
return 0;
@@ -103,7 +116,7 @@ public static int decode(byte[] in, int len, byte[] out) {
103116
int end = len;
104117
while (end > 0) {
105118
int chr = in[end - 1] & 0xff;
106-
if (decodeMap[chr] == DECODE_WHITESPACE) {
119+
if (decodeMapInt[chr] == DECODE_WHITESPACE) {
107120
end--;
108121
continue;
109122
}
@@ -121,7 +134,7 @@ public static int decode(byte[] in, int len, byte[] out) {
121134
if (chr == '=') {
122135
break;
123136
}
124-
int value = decodeMap[chr];
137+
int value = decodeMapInt[chr];
125138
if (value == DECODE_WHITESPACE) {
126139
continue;
127140
}
@@ -148,7 +161,7 @@ public static int decode(byte[] in, int len, byte[] out) {
148161
if (chr == '=') {
149162
break;
150163
}
151-
int bits = decodeMap[chr];
164+
int bits = decodeMapInt[chr];
152165
if (bits == DECODE_WHITESPACE) {
153166
continue;
154167
}
@@ -184,6 +197,8 @@ public static int decode(byte[] in, byte[] out) {
184197
return decode(in, in.length, out);
185198
}
186199

200+
@DisableDebugInfo
201+
@DisableNullChecksAndArrayBoundsChecks
187202
private static int decodeNoWhitespace(byte[] in, int len, byte[] out) {
188203
if ((len & 0x3) != 0) {
189204
return -1;
@@ -207,8 +222,8 @@ private static int decodeNoWhitespace(byte[] in, int len, byte[] out) {
207222
throw new IllegalArgumentException("Output buffer too small for decoded data");
208223
}
209224
int outIndex = 0;
210-
byte[] decodeMapLocal = decodeMap;
211225
int fullLen = len - (pad > 0 ? 4 : 0);
226+
int[] decodeMapLocal = decodeMapInt;
212227

213228
for (int i = 0; i < fullLen; i += 4) {
214229
int c0 = in[i] & 0xff;
@@ -334,6 +349,8 @@ public static String encodeNoNewline(byte[] in) {
334349
* @param out destination buffer
335350
* @return number of bytes written to {@code out}
336351
*/
352+
@DisableDebugInfo
353+
@DisableNullChecksAndArrayBoundsChecks
337354
public static int encodeNoNewline(byte[] in, byte[] out) {
338355
int inputLength = in.length;
339356
int outputLength = ((inputLength + 2) / 3) * 4;

docs/developer-guide/performance.asciidoc

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,60 @@ The simulator contains some tools to measure performance overhead of a specific
4545
* *On some platforms mutable images are slow* - mutable images are images you can draw on (using `getGraphics()`). On some platforms they perform quite badly (e.g. iOS) and should generally be avoided. You can check if mutable images are fast on a platform using `Display.areMutableImagesFast()`.
4646
* *Make components either transparent or opaque* - a translucent component must paint its parent every time. This can be expensive. An opaque component might have margins that would require that we paint the parent so there is often overdraw in such cases (overdraw means the same pixel being painted twice).
4747

48+
==== ParparVM Native Translation Performance Hints
49+
50+
For ParparVM-generated native code, we now support method-level optimization hints via annotations. These can provide very good wins in hot code paths, but they come with tradeoffs and should be applied surgically.
51+
52+
===== Method-level codegen hints
53+
54+
* `@DisableDebugInfo` +
55+
Suppresses generated line/debug metadata for the annotated method.
56+
This can reduce generated C size and remove some per-instruction debug overhead.
57+
58+
* `@DisableNullChecksAndArrayBoundsChecks` +
59+
Suppresses generated null and array-bounds checks for the annotated method.
60+
This can significantly reduce branch-heavy code in tight loops.
61+
62+
TIP: Use these only on methods that are both performance-critical and well-covered by tests. These annotations intentionally trade runtime safety diagnostics for speed.
63+
64+
===== Fast method-stack path
65+
66+
The translator can emit a fast method-stack prologue/epilogue (`DEFINE_METHOD_STACK_FAST_*` and `CN1_FAST_RETURN_RELEASE`) for methods that meet strict safety criteria.
67+
68+
In practice, this tends to help for:
69+
70+
* Small, hot methods.
71+
* Methods without monitor usage / exception-heavy flow.
72+
* Methods with straightforward control flow and low instruction complexity.
73+
74+
Tradeoffs:
75+
76+
* Overly broad fast-path eligibility can regress performance if extra branches or memory writes are introduced.
77+
* Primitive-only fast-frame variants may not always outperform a straightforward full clear on all targets/compilers.
78+
79+
TIP: Benchmark representative workloads after enabling fast-stack behavior. Keep eligibility conservative and expand only where measurement shows consistent gains.
80+
81+
===== Base64-style hot-loop guidelines
82+
83+
For low-level loops (e.g. Base64 encode/decode):
84+
85+
* Prefer simple loop bodies with predictable branches.
86+
* Cache decode/lookup tables in primitive arrays (`int[]` lookup tables can reduce per-iteration conversion overhead).
87+
* Avoid adding “defensive” branches in the innermost loop unless they are required for correctness on production inputs.
88+
89+
===== Build configuration matters
90+
91+
When benchmarking translator output, ensure native projects are compiled with optimization enabled (e.g. CMake `Release` builds). Debug/default builds can hide improvements or produce misleading regressions.
92+
93+
If you are using the integration test harness, make sure CMake is configured with:
94+
95+
[source]
96+
----
97+
-DCMAKE_BUILD_TYPE=Release
98+
----
99+
100+
Without this setting, comparison between Java and ParparVM native output is often noisy and can lead to incorrect optimization conclusions.
101+
48102
=== Performance Monitor
49103

50104
The Performance Monitor tool is accessible via the #Simulator# -> #Performance Monitor# menu option in the simulator. This launches the following UI that can help you improve application performance:

vm/ByteCodeTranslator/src/cn1_globals.h

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -879,6 +879,8 @@ extern void throwException(CODENAME_ONE_THREAD_STATE, JAVA_OBJECT exceptionArg);
879879
extern JAVA_INT throwException_R_int(CODENAME_ONE_THREAD_STATE, JAVA_OBJECT exceptionArg);
880880
extern JAVA_BOOLEAN throwException_R_boolean(CODENAME_ONE_THREAD_STATE, JAVA_OBJECT exceptionArg);
881881
extern JAVA_OBJECT __NEW_java_lang_NullPointerException(CODENAME_ONE_THREAD_STATE);
882+
extern JAVA_OBJECT __NEW_INSTANCE_java_lang_NullPointerException(CODENAME_ONE_THREAD_STATE);
883+
extern JAVA_OBJECT __NEW_INSTANCE_java_lang_StackOverflowError(CODENAME_ONE_THREAD_STATE);
882884
extern JAVA_OBJECT __NEW_java_lang_ArrayIndexOutOfBoundsException(CODENAME_ONE_THREAD_STATE);
883885
extern JAVA_VOID java_lang_ArrayIndexOutOfBoundsException___INIT_____int(CODENAME_ONE_THREAD_STATE, JAVA_OBJECT __cn1ThisObject, JAVA_INT __cn1Arg1);
884886
extern void throwArrayIndexOutOfBoundsException(CODENAME_ONE_THREAD_STATE, int index);
@@ -1129,6 +1131,31 @@ extern JAVA_OBJECT newStringFromCString(CODENAME_ONE_THREAD_STATE, const char *s
11291131
extern void initConstantPool();
11301132

11311133
extern void initMethodStack(CODENAME_ONE_THREAD_STATE, JAVA_OBJECT __cn1ThisObject, int stackSize, int localsStackSize, int classNameId, int methodNameId);
1134+
/*
 * Inline frame-setup helper used by the DEFINE_*_METHOD_STACK_FAST_* macros.
 * Unlike initMethodStack it takes no class/method name ids.
 *
 * Steps, in order:
 *   1. When CN1_INCLUDE_NPE_CHECKS is defined, raise a NullPointerException
 *      if __cn1ThisObject is JAVA_NULL. The non-instance macros pass
 *      (JAVA_OBJECT)1 here, presumably so that this check never fires for
 *      static methods.
 *   2. If callStackOffset has reached CN1_STACK_OVERFLOW_CALL_DEPTH_LIMIT - 1,
 *      throw a StackOverflowError and return without touching the frame.
 *   3. Zero (localsStackSize + stackSize) elementStruct slots starting at the
 *      current threadObjectStackOffset.
 *   4. Advance threadObjectStackOffset past the new frame and increment
 *      callStackOffset.
 *
 * fullClear: JAVA_TRUE from the *_FAST_REF macros, JAVA_FALSE from the
 * *_FAST_PRIMITIVE macros. Both branches currently perform the same memset,
 * so the flag does not change behavior at present.
 */
static inline void cn1_init_method_stack_fast(CODENAME_ONE_THREAD_STATE, JAVA_OBJECT __cn1ThisObject, int stackSize, int localsStackSize, JAVA_BOOLEAN fullClear) {
1135+
#ifdef CN1_INCLUDE_NPE_CHECKS
1136+
if(__cn1ThisObject == JAVA_NULL) {
1137+
THROW_NULL_POINTER_EXCEPTION();
1138+
}
1139+
#endif
1140+
/* Depth guard: refuse to push another frame once the call-depth limit is reached. */
if (threadStateData->callStackOffset >= CN1_STACK_OVERFLOW_CALL_DEPTH_LIMIT - 1) {
1141+
throwException(threadStateData, __NEW_INSTANCE_java_lang_StackOverflowError(threadStateData));
1142+
return;
1143+
}
1144+
if (fullClear) {
1145+
/* Zero every local and operand-stack slot of the new frame. */
memset(&threadStateData->threadObjectStack[threadStateData->threadObjectStackOffset], 0,
1146+
sizeof(struct elementStruct) * (localsStackSize + stackSize));
1147+
} else {
1148+
/*
1149+
* Primitive-only fast frames intentionally use the same memset strategy.
1150+
* A per-slot type-only loop was measurably slower in benchmarks and did
1151+
* not improve generated-code performance.
1152+
*/
1153+
memset(&threadStateData->threadObjectStack[threadStateData->threadObjectStackOffset], 0,
1154+
sizeof(struct elementStruct) * (localsStackSize + stackSize));
1155+
}
1156+
/* Reserve the frame's slots and record the deeper call level. */
threadStateData->threadObjectStackOffset += localsStackSize + stackSize;
1157+
threadStateData->callStackOffset++;
1158+
}
11321159

11331160
// we need to zero out the values with memset otherwise we will run into a problem
11341161
// when invoking release on pre-existing object which might be garbage
@@ -1150,6 +1177,46 @@ extern void initMethodStack(CODENAME_ONE_THREAD_STATE, JAVA_OBJECT __cn1ThisObje
11501177
const int currentCodenameOneCallStackOffset = threadStateData->callStackOffset;\
11511178
int methodBlockOffset = threadStateData->tryBlockOffset;
11521179

1180+
#define DEFINE_METHOD_STACK_FAST_REF(stackSize, localsStackSize, spPosition) \
1181+
const int cn1LocalsBeginInThread = threadStateData->threadObjectStackOffset; \
1182+
struct elementStruct* locals = &threadStateData->threadObjectStack[cn1LocalsBeginInThread]; \
1183+
struct elementStruct* stack = &threadStateData->threadObjectStack[threadStateData->threadObjectStackOffset + localsStackSize]; \
1184+
struct elementStruct* SP = &stack[spPosition]; \
1185+
cn1_init_method_stack_fast(threadStateData, (JAVA_OBJECT)1, stackSize, localsStackSize, JAVA_TRUE); \
1186+
const int currentCodenameOneCallStackOffset = threadStateData->callStackOffset;\
1187+
int methodBlockOffset = threadStateData->tryBlockOffset;
1188+
1189+
#define DEFINE_INSTANCE_METHOD_STACK_FAST_REF(stackSize, localsStackSize, spPosition) \
1190+
const int cn1LocalsBeginInThread = threadStateData->threadObjectStackOffset; \
1191+
struct elementStruct* locals = &threadStateData->threadObjectStack[cn1LocalsBeginInThread]; \
1192+
struct elementStruct* stack = &threadStateData->threadObjectStack[threadStateData->threadObjectStackOffset + localsStackSize]; \
1193+
struct elementStruct* SP = &stack[spPosition]; \
1194+
cn1_init_method_stack_fast(threadStateData, __cn1ThisObject, stackSize, localsStackSize, JAVA_TRUE); \
1195+
const int currentCodenameOneCallStackOffset = threadStateData->callStackOffset;\
1196+
int methodBlockOffset = threadStateData->tryBlockOffset;
1197+
1198+
#define DEFINE_METHOD_STACK_FAST_PRIMITIVE(stackSize, localsStackSize, spPosition) \
1199+
const int cn1LocalsBeginInThread = threadStateData->threadObjectStackOffset; \
1200+
struct elementStruct* locals = &threadStateData->threadObjectStack[cn1LocalsBeginInThread]; \
1201+
struct elementStruct* stack = &threadStateData->threadObjectStack[threadStateData->threadObjectStackOffset + localsStackSize]; \
1202+
struct elementStruct* SP = &stack[spPosition]; \
1203+
cn1_init_method_stack_fast(threadStateData, (JAVA_OBJECT)1, stackSize, localsStackSize, JAVA_FALSE); \
1204+
const int currentCodenameOneCallStackOffset = threadStateData->callStackOffset;\
1205+
int methodBlockOffset = threadStateData->tryBlockOffset;
1206+
1207+
#define DEFINE_INSTANCE_METHOD_STACK_FAST_PRIMITIVE(stackSize, localsStackSize, spPosition) \
1208+
const int cn1LocalsBeginInThread = threadStateData->threadObjectStackOffset; \
1209+
struct elementStruct* locals = &threadStateData->threadObjectStack[cn1LocalsBeginInThread]; \
1210+
struct elementStruct* stack = &threadStateData->threadObjectStack[threadStateData->threadObjectStackOffset + localsStackSize]; \
1211+
struct elementStruct* SP = &stack[spPosition]; \
1212+
cn1_init_method_stack_fast(threadStateData, __cn1ThisObject, stackSize, localsStackSize, JAVA_FALSE); \
1213+
const int currentCodenameOneCallStackOffset = threadStateData->callStackOffset;\
1214+
int methodBlockOffset = threadStateData->tryBlockOffset;
1215+
1216+
#define CN1_FAST_RETURN_RELEASE() \
1217+
threadStateData->threadObjectStackOffset = cn1LocalsBeginInThread; \
1218+
threadStateData->callStackOffset--;
1219+
11531220

11541221
#if defined(__APPLE__) && defined(__OBJC__)
11551222
@class NSString;

0 commit comments

Comments
 (0)