Skip to content

Commit d6e81bd

Browse files
authored
Merge branch 'main' into main
2 parents b55368e + 9e2a5f9 commit d6e81bd

85 files changed

Lines changed: 2523 additions & 720 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.circleci/continue_config.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,9 @@ jobs:
9393
- run:
9494
name: Run linters and code style checks
9595
command: make py-style
96+
- run:
97+
name: Exercise the benchmarks
98+
command: make benchmark-ci
9699
- run:
97100
name: Run cicd tests
98101
command: make cicd-test

.prettierignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ vscode/extension/.vscode-test/
2323

2424
sqlmesh
2525
docs
26-
tests
26+
/tests/**
2727
examples
2828
posts
2929
.circleci

Makefile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,3 +181,6 @@ vscode-generate-openapi:
181181
python3 web/server/openapi.py --output vscode/openapi.json
182182
pnpm run fmt
183183
cd vscode/react && pnpm run generate:api
184+
185+
benchmark-ci:
186+
python benchmarks/lsp_render_model_bench.py --debug-single-value
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
#!/usr/bin/env python
2+
3+
import asyncio
4+
import pyperf
5+
import os
6+
import logging
7+
from pathlib import Path
8+
from lsprotocol import types
9+
10+
from sqlmesh.lsp.custom import RenderModelRequest, RENDER_MODEL_FEATURE
11+
from sqlmesh.lsp.uri import URI
12+
from pygls.client import JsonRPCClient
13+
14+
# Suppress debug logging during benchmark
15+
logging.getLogger().setLevel(logging.WARNING)
16+
17+
18+
class LSPClient(JsonRPCClient):
    """A custom LSP client for benchmarking.

    Wraps pygls' JSON-RPC client so the benchmark can talk to the SQLMesh
    LSP server over stdio. Window notifications from the server are
    swallowed so they don't pollute benchmark output.
    """

    def __init__(self):
        super().__init__()
        # Holds the last render_model response; not read in this script but
        # available for debugging — TODO confirm whether it can be removed.
        self.render_model_result = None
        # Set once the initialize/initialized handshake has completed.
        self.initialized = asyncio.Event()

        # Register handlers for notifications we expect from the server
        @self.feature(types.WINDOW_SHOW_MESSAGE)
        def handle_show_message(_):
            # Silently ignore show message notifications during benchmark
            pass

        @self.feature(types.WINDOW_LOG_MESSAGE)
        def handle_log_message(_):
            # Silently ignore log message notifications during benchmark
            pass

    async def initialize_server(self):
        """Send initialization request to server.

        Points the server at the bundled sushi example project (resolved
        relative to this file), completes the LSP `initialize`/`initialized`
        handshake, sets ``self.initialized``, and returns the server's
        initialize response.
        """
        # Get the sushi example directory
        sushi_dir = Path(__file__).parent.parent / "examples" / "sushi"

        response = await self.protocol.send_request_async(
            types.INITIALIZE,
            types.InitializeParams(
                process_id=os.getpid(),
                root_uri=URI.from_path(sushi_dir).value,
                capabilities=types.ClientCapabilities(),
                workspace_folders=[
                    types.WorkspaceFolder(
                        uri=URI.from_path(sushi_dir).value,
                        name="sushi"
                    )
                ]
            )
        )

        # Send initialized notification
        self.protocol.notify(types.INITIALIZED, types.InitializedParams())
        self.initialized.set()
        return response
61+
62+
63+
async def benchmark_render_model_async(client: LSPClient, model_path: Path):
    """Issue one render_model request for *model_path* and return the server's reply."""
    return await client.protocol.send_request_async(
        RENDER_MODEL_FEATURE,
        RenderModelRequest(textDocumentUri=URI.from_path(model_path).value),
    )
74+
75+
76+
def benchmark_render_model(loops):
    """Synchronous wrapper for the benchmark.

    pyperf time-function contract: receives the requested number of loop
    iterations and returns the wall-clock time spent on exactly those
    iterations. Server startup, initialization, warm-up, and shutdown all
    happen outside the timed section.
    """
    async def run():
        # Create client
        client = LSPClient()

        # Start the SQLMesh LSP server as a subprocess
        await client.start_io("python", "-m", "sqlmesh.lsp.main")

        # Initialize the server
        await client.initialize_server()

        # Get a model file to test with
        sushi_dir = Path(__file__).parent.parent / "examples" / "sushi"
        model_path = sushi_dir / "models" / "customers.sql"

        # Warm up
        await benchmark_render_model_async(client, model_path)

        # Run benchmark
        t0 = pyperf.perf_counter()
        for _ in range(loops):
            await benchmark_render_model_async(client, model_path)
        dt = pyperf.perf_counter() - t0

        # Clean up
        await client.stop()

        return dt

    return asyncio.run(run())
107+
108+
109+
def main():
    """Register the render-model benchmark with a pyperf runner and run it."""
    pyperf.Runner().bench_time_func("lsp_render_model", benchmark_render_model)


if __name__ == "__main__":
    main()
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# Security Overview
2+
3+
4+
At Tobiko, we treat security as a first-class citizen because we know how valuable your data assets are. Our team follows and executes security best practices across each layer of our product.
5+
6+
## Tobiko Cloud Standard Deployment
7+
8+
Our standard Tobiko Cloud deployment consists of several components that are each responsible for different parts of the product.
9+
10+
Below is a diagram of the components along with their descriptions.
11+
12+
![tobiko_cloud_standard_deployment](./tcloud_standard_deployment.png){ width=80% height=60% style="display: block; margin: 0 auto" }
13+
14+
- **Scheduler**: Orchestrates schedule cadence and hosts state metadata (code versions, logs, cost)
15+
- **Executor**: Applies code changes and runs SQL queries (actual data processing in SQL Engine) and Python models in proper DAG order.
16+
- **Gateway**: Stores authentication credentials for SQL Engine. Secured through encryption.
17+
- **SQL Engine**: Processes and stores data based on the above instructions within the **customer’s** environment.
18+
19+
## Tobiko Cloud Hybrid Deployment
20+
21+
For some customers, our hybrid deployment option is a great fit. It provides a seamless experience with Tobiko Cloud but within your own VPC and infrastructure.
22+
23+
In a hybrid deployment, Tobiko Cloud does not execute tasks directly with the engine. Instead, it passes tasks to the executors hosted in your environment, which then execute the tasks with the engine.
24+
25+
Executors are Docker containers that connect to both Tobiko Cloud and your SQL engine. They pull work tasks from the Tobiko Cloud scheduler and execute them with your SQL engine. This is a pull-only mechanism authenticated through an OAuth Client ID/Secret. Whitelist IPs in your network to allow reaching Tobiko Cloud IPs from the executor: 34.28.17.91, 34.136.27.153, 34.136.131.20
26+
27+
Below is a diagram of the components along with their description.
28+
29+
![tobiko_cloud_hybrid_deployment](./tcloud_hybrid_deployment.png){ width=80% height=60% style="display: block; margin: 0 auto" }
30+
31+
- **Scheduler**: Orchestrates schedule cadence and hosts state metadata (code versions, logs, cost). **Never pushes** instructions to executor.
32+
- **Executor**: Applies code changes and runs SQL queries and Python models in proper DAG order (actual data processing in SQL Engine)
33+
- **Gateway**: Stores authentication credentials for SQL Engine. Secured through your secrets manager or Kubernetes Secrets.
34+
- **SQL Engine**: Processes and stores data based on the above instructions
35+
- **Executor -> Scheduler**: A pull-only mechanism for obtaining work tasks.
36+
- **Helm Chart**: For production environments, we provide a [Helm chart](../scheduler/hybrid_executors_helm.md) that includes robust configurability, secret management, and scaling options.
37+
- **Docker Compose**: For simpler environments or testing, we offer a [Docker Compose setup](../scheduler/hybrid_executors_docker_compose.md) to quickly deploy executors on any machine with Docker.
38+
39+
40+
41+
## Internal Code Practices
42+
43+
We enforce coding standards throughout Tobiko to write, maintain, and collaborate on code effectively. These practices ensure consistency, maintainability, reliability, and most importantly, trust.
44+
45+
A few key components of our internal code requirements:
46+
47+
- We use signed Git commits, required approvers, and signed Docker artifacts.
48+
- Each commit to a `main` branch must be approved by someone other than the author.
49+
- We sign commits and register the key with GitHub ([Github Docs](https://docs.github.com/en/authentication/managing-commit-signature-verification/signing-commits)).
50+
- Binaries are signed using cosign and OIDC for keyless ([Signing docs](https://docs.sigstore.dev/cosign/signing/overview/)).
51+
- Attestations are created to certify an image, enforced with GCP Binary Authorization ([Attestation docs](https://cloud.google.com/binary-authorization/docs/key-concepts#attestations)).
52+
- Encryption is a key feature of our security posture and is enforced at each stage of access. For example, the state database automatically encrypts all data. Credentials are also securely encrypted and stored.
53+
- We back up each state database nightly and before upgrades. These backups are stored for 14 days.
54+
55+
## Penetration Testing
56+
57+
At least once a year, Tobiko engages a third-party security firm to perform a penetration test. This test evaluates our systems by identifying and attempting to exploit known vulnerabilities, focusing on critical external and/or internal assets. A detailed report is available upon request.
58+
59+
60+
## Asset and Access Management
61+
62+
### How do we protect PGP keys?
63+
64+
If an employee loses their laptop, we don't need to get the old PGP key back because we can invalidate the key directly.
65+
66+
We use GitHub to sign code commits. At the time the code was committed, the PGP key was valid. When an employee loses their laptop, we will invalidate it, and they will regenerate a new key to use in future commits. The old commits are still valid because the PGP key was valid at the time the commit was made.
67+
68+
### How do we invalidate PGP keys if someone did steal it and could potentially use it?
69+
70+
We would revoke access for the GitHub user account associated with the compromised key and not give it access again until the old PGP key is deprecated and a new key issued.
71+
72+
### If someone steals a laptop, what's our continuity plan in protecting code?
73+
74+
- All employee devices are monitored for proper encryption and password policies.
75+
- Laptop protection is enforced through file encryption via Vanta.
76+
- Mandatory lock screen after a timeout.
77+
- We follow a formal IT asset disposal procedure to prevent key compromise through improper hardware disposal.
78+
- See above for PGP key protection.
79+
- Binaries are signed using Cosign and OIDC for keyless signing.
80+
56.5 KB
Loading
89.7 KB
Loading

docs/concepts/macros/macro_variables.md

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -130,11 +130,21 @@ SQLMesh provides additional predefined variables used to modify model behavior b
130130
* 'auditing' - The audit is being run.
131131
* 'testing' - The model query logic is being evaluated in the context of a unit test.
132132
* @gateway - A string value containing the name of the current [gateway](../../guides/connections.md).
133-
* @this_model - A string value containing the name of the physical table the model view selects from. Typically used to create [generic audits](../audits.md#generic-audits). In the case of [on_virtual_update statements](../models/sql_models.md#optional-on-virtual-update-statements) it contains the qualified view name instead.
134-
* Can be used in model definitions when SQLGlot cannot fully parse a statement and you need to reference the model's underlying physical table directly.
135-
* Can be passed as an argument to macros that access or interact with the underlying physical table.
133+
* @this_model - The physical table name that the model's view selects from. Typically used to create [generic audits](../audits.md#generic-audits). When used in [on_virtual_update statements](../models/sql_models.md#optional-on-virtual-update-statements), it contains the qualified view name instead.
136134
* @model_kind_name - A string value containing the name of the current model kind. Intended to be used in scenarios where you need to control the [physical properties in model defaults](../../reference/model_configuration.md#model-defaults).
137135

136+
!!! note "Embedding variables in strings"
137+
138+
Macro variable references sometimes use the curly brace syntax `@{variable}`, which serves a different purpose than the regular `@variable` syntax.
139+
140+
The curly brace syntax tells SQLMesh that the rendered string should be treated as an identifier, instead of simply replacing the macro variable value.
141+
142+
For example, if `variable` is defined as `@DEF(variable, foo.bar)`, then `@variable` produces `foo.bar`, while `@{variable}` produces `"foo.bar"`. This is because SQLMesh converts `foo.bar` into an identifier, using double quotes to correctly include the `.` character in the identifier name.
143+
144+
In practice, `@{variable}` is most commonly used to interpolate a value within an identifier, e.g., `@{variable}_suffix`, whereas `@variable` is used to do plain substitutions for string literals.
145+
146+
Learn more [above](#embedding-variables-in-strings).
147+
138148
#### Before all and after all variables
139149

140150
The following variables are also available in [`before_all` and `after_all` statements](../../guides/configuration.md#before_all-and-after_all-statements), as well as in macros invoked within them.

docs/concepts/macros/sqlmesh_macros.md

Lines changed: 65 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,59 @@ It uses the following five step approach to accomplish this:
3838

3939
5. Modify the semantic representation of the SQL query with the substituted variable values from (3) and functions from (4).
4040

41+
### Embedding variables in strings
42+
43+
SQLMesh always incorporates macro variable values into the semantic representation of a SQL query (step 5 above). To do that, it infers the role each macro variable value plays in the query.
44+
45+
For context, two commonly used types of string in SQL are:
46+
47+
- String literals, which represent text values and are surrounded by single quotes, such as `'the_string'`
48+
- Identifiers, which reference database objects like column, table, alias, and function names
49+
- They may be unquoted or quoted with double quotes, backticks, or brackets, depending on the SQL dialect
50+
51+
In a normal query, SQLMesh can easily determine which role a given string is playing. However, it is more difficult if a macro variable is embedded directly into a string - especially if the string is in the `MODEL` block (and not the query itself).
52+
53+
For example, consider a project that defines a [gateway variable](#gateway-variables) named `gateway_var`. The project includes a model that references `@gateway_var` as part of the schema in the model's `name`, which is a SQL *identifier*.
54+
55+
This is how we might try to write the model:
56+
57+
``` sql title="Incorrectly rendered to string literal"
58+
MODEL (
59+
name the_@gateway_var_schema.table
60+
);
61+
```
62+
63+
From SQLMesh's perspective, the model schema is the combination of three sub-strings: `the_`, the value of `@gateway_var`, and `_schema`.
64+
65+
SQLMesh will concatenate those strings, but it does not have the context to know that it is building a SQL identifier and will return a string literal.
66+
67+
To provide the context SQLMesh needs, you must add curly braces to the macro variable reference: `@{gateway_var}` instead of `@gateway_var`:
68+
69+
``` sql title="Correctly rendered to identifier"
70+
MODEL (
71+
name the_@{gateway_var}_schema.table
72+
);
73+
```
74+
75+
The curly braces let SQLMesh know that it should treat the string as a SQL identifier, which it will then quote based on the SQL dialect's quoting rules.
76+
77+
While the most common use of the curly brace syntax is embedding macro variables into strings, it can also be used to differentiate string literals and identifiers in SQL queries. For example, consider a macro variable `my_variable` whose value is `col`.
78+
79+
If we `SELECT` this value with regular macro syntax, it will render to a string literal:
80+
81+
``` sql
82+
SELECT @my_variable AS the_column; -- renders to SELECT 'col' AS the_column
83+
```
84+
85+
`'col'` is surrounded with single quotes, and the SQL engine will use that string as the column's data value.
86+
87+
If we use curly braces, SQLMesh will know that we want to use the rendered string as an identifier:
88+
89+
``` sql
90+
SELECT @{my_variable} AS the_column; -- renders to SELECT col AS the_column
91+
```
92+
93+
`col` is not surrounded with single quotes, and the SQL engine will determine that the query is referencing a column or other object named `col`.
4194

4295
## User-defined variables
4396

@@ -174,6 +227,8 @@ SELECT
174227
FROM @customer.some_source
175228
```
176229

230+
Note the use of both regular `@field_a` and curly brace syntax `@{field_b}` macro variable references in the model query. Learn more [above](#embedding-variables-in-strings)
231+
177232
Blueprint variables can be accessed using the syntax shown above, or through the `@BLUEPRINT_VAR()` macro function, which also supports specifying default values in case the variable is undefined (similar to `@VAR()`).
178233

179234
### Local variables
@@ -448,7 +503,13 @@ FROM table
448503

449504
This syntax works regardless of whether the array values are quoted or not.
450505

451-
NOTE: SQLMesh macros support placing macro values at the end of a column name simply using `column_@x`. However if you wish to substitute the variable anywhere else in the identifier, you need to use the more explicit substitution syntax `@{}`. This avoids ambiguity. These are valid uses: `@{x}_column` or `my_@{x}_column`.
506+
!!! note "Embedding macros in strings"
507+
508+
SQLMesh macros support placing macro values at the end of a column name using `column_@x`.
509+
510+
However, if you wish to substitute the variable anywhere else in the identifier, you need to use the more explicit curly brace syntax `@{}` to avoid ambiguity. For example, these are valid uses: `@{x}_column` or `my_@{x}_column`.
511+
512+
Learn more about embedding macros in strings [above](#embedding-variables-in-strings)
452513

453514
### @IF
454515

@@ -1087,7 +1148,9 @@ The `template` can contain the following placeholders that will be substituted:
10871148
- `@{schema_name}` - The name of the physical schema that SQLMesh is using for the model version table, eg `sqlmesh__landing`
10881149
- `@{table_name}` - The name of the physical table that SQLMesh is using for the model version, eg `landing__customers__2517971505`
10891150

1090-
It can be used in a `MODEL` block:
1151+
Note the use of the curly brace syntax `@{}` in the template placeholders - learn more [above](#embedding-variables-in-strings).
1152+
1153+
The `@resolve_template` macro can be used in a `MODEL` block:
10911154

10921155
```sql linenums="1" hl_lines="5"
10931156
MODEL (

docs/concepts/models/external_models.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,9 @@ FROM
7070
@{gateway}_db.external_table;
7171
```
7272

73-
This table will be named differently depending on which `--gateway` SQLMesh is run with. For example:
73+
This table will be named differently depending on which `--gateway` SQLMesh is run with (learn more about the curly brace `@{gateway}` syntax [here](../../concepts/macros/sqlmesh_macros.md#embedding-variables-in-strings)).
74+
75+
For example:
7476

7577
- `sqlmesh --gateway dev plan` - SQLMesh will try to query `dev_db.external_table`
7678
- `sqlmesh --gateway prod plan` - SQLMesh will try to query `prod_db.external_table`

0 commit comments

Comments
 (0)