feat(lsp): add go to definition for ctes

benfdking · benfdking · commit d5c8add28cf1 · 2025-05-23T18:38:21.000+02:00
diff --git a/examples/sushi/models/customers.sql b/examples/sushi/models/customers.sql
@@ -17,7 +17,7 @@ CREATE VIEW raw.demographics AS (
   SELECT 1 AS customer_id, '00000' AS zip
 );
 
-WITH current_marketing AS (
+WITH current_marketing_outer AS (
   SELECT
     customer_id,
     status
@@ -29,7 +29,15 @@ SELECT DISTINCT
   m.status,
   d.zip
   FROM sushi.orders AS o
-LEFT JOIN current_marketing AS m
+LEFT JOIN (
+  WITH current_marketing AS (
+    SELECT
+      customer_id,
+      status
+    FROM current_marketing_outer
+  )
+  SELECT * FROM current_marketing
+) AS m
   ON o.customer_id = m.customer_id
 LEFT JOIN raw.demographics AS d
   ON o.customer_id = d.customer_id
diff --git a/sqlmesh/lsp/main.py b/sqlmesh/lsp/main.py
@@ -265,21 +265,31 @@ def goto_definition(
                     raise RuntimeError(f"No context found for document: {document.path}")
 
                 references = get_references(self.lsp_context, uri, params.position)
-                return [
-                    types.LocationLink(
-                        target_uri=reference.uri,
-                        target_selection_range=types.Range(
+                location_links = []
+                for reference in references:
+                    # Use target_range if available (for CTEs), otherwise default to start of file
+                    if reference.target_range:
+                        target_range = reference.target_range
+                        target_selection_range = reference.target_range
+                    else:
+                        target_range = types.Range(
                             start=types.Position(line=0, character=0),
                             end=types.Position(line=0, character=0),
-                        ),
-                        target_range=types.Range(
+                        )
+                        target_selection_range = types.Range(
                             start=types.Position(line=0, character=0),
                             end=types.Position(line=0, character=0),
-                        ),
-                        origin_selection_range=reference.range,
+                        )
+
+                    location_links.append(
+                        types.LocationLink(
+                            target_uri=reference.uri,
+                            target_selection_range=target_selection_range,
+                            target_range=target_range,
+                            origin_selection_range=reference.range,
+                        )
                     )
-                    for reference in references
-                ]
+                return location_links
             except Exception as e:
                 ls.show_message(f"Error getting references: {e}", types.MessageType.Error)
                 return []
diff --git a/sqlmesh/lsp/reference.py b/sqlmesh/lsp/reference.py
@@ -5,23 +5,26 @@
 from sqlmesh.core.model.definition import SqlModel
 from sqlmesh.lsp.context import LSPContext, ModelTarget, AuditTarget
 from sqlglot import exp
+from sqlglot.optimizer.scope import build_scope
 from sqlmesh.lsp.uri import URI
 from sqlmesh.utils.pydantic import PydanticModel
 
 
 class Reference(PydanticModel):
     """
-    A reference to a model.
+    A reference to a model or CTE.
 
     Attributes:
         range: The range of the reference in the source file
-        uri: The uri of the referenced model
-        description: The description of the referenced model
+        uri: The uri of the referenced model or file
+        description: The description of the referenced model or CTE
+        target_range: The range of the definition for go-to-definition (optional, used for CTEs)
     """
 
     range: Range
     uri: str
     description: t.Optional[str] = None
+    target_range: t.Optional[Range] = None
 
 
 def by_position(position: Position) -> t.Callable[[Reference], bool]:
@@ -87,6 +90,7 @@ def get_model_definitions_for_a_path(
     - Need to normalize it before matching
     - Try get_model before normalization
     - Match to models that the model refers to
+    - Also find CTE references within the query
     """
     path = document_uri.to_path()
     if path.suffix != ".sql":
@@ -125,64 +129,121 @@ def get_model_definitions_for_a_path(
     # Find all possible references
     references = []
 
-    # Get SQL query and find all table references
-    tables = list(query.find_all(exp.Table))
-    if len(tables) == 0:
-        return []
-
     with open(file_path, "r", encoding="utf-8") as file:
         read_file = file.readlines()
 
-    for table in tables:
-        # Normalize the table reference
-        unaliased = table.copy()
-        if unaliased.args.get("alias") is not None:
-            unaliased.set("alias", None)
-        reference_name = unaliased.sql(dialect=dialect)
-        try:
-            normalized_reference_name = normalize_model_name(
-                reference_name,
-                default_catalog=lint_context.context.default_catalog,
-                dialect=dialect,
-            )
-            if normalized_reference_name not in depends_on:
-                continue
-        except Exception:
-            # Skip references that cannot be normalized
-            continue
-
-        # Get the referenced model uri
-        referenced_model = lint_context.context.get_model(
-            model_or_snapshot=normalized_reference_name, raise_if_missing=False
-        )
-        if referenced_model is None:
-            continue
-        referenced_model_path = referenced_model._path
-        # Check whether the path exists
-        if not referenced_model_path.is_file():
-            continue
-        referenced_model_uri = URI.from_path(referenced_model_path)
-
-        # Extract metadata for positioning
-        table_meta = TokenPositionDetails.from_meta(table.this.meta)
-        table_range = _range_from_token_position_details(table_meta, read_file)
-        start_pos = table_range.start
-        end_pos = table_range.end
-
-        # If there's a catalog or database qualifier, adjust the start position
-        catalog_or_db = table.args.get("catalog") or table.args.get("db")
-        if catalog_or_db is not None:
-            catalog_or_db_meta = TokenPositionDetails.from_meta(catalog_or_db.meta)
-            catalog_or_db_range = _range_from_token_position_details(catalog_or_db_meta, read_file)
-            start_pos = catalog_or_db_range.start
-
-        references.append(
-            Reference(
-                uri=referenced_model_uri.value,
-                range=Range(start=start_pos, end=end_pos),
-                description=referenced_model.description,
-            )
-        )
+    # Build scope tree to properly handle nested CTEs
+    root_scope = build_scope(query)
+
+    if root_scope:
+        # Traverse all scopes to find CTE definitions and table references
+        for scope in root_scope.traverse():
+            # Build a map of CTE names to their definitions within this scope
+            cte_definitions = {}
+
+            # For CTEs defined in this scope
+            for cte in scope.ctes:
+                if cte.alias:
+                    cte_definitions[cte.alias] = cte
+
+            # Also include CTEs from parent scopes (for references inside nested CTEs)
+            parent = scope.parent
+            while parent:
+                for cte in parent.ctes:
+                    if cte.alias and cte.alias not in cte_definitions:
+                        cte_definitions[cte.alias] = cte
+                parent = parent.parent
+
+            # Get all table references in this scope
+            tables = list(scope.find_all(exp.Table))
+
+            for table in tables:
+                table_name = table.name
+
+                # Check if this table reference is a CTE in the current scope
+                if table_name in cte_definitions:
+                    try:
+                        # This is a CTE reference - create a reference to the CTE definition
+                        cte_def = cte_definitions[table_name]
+                        args = cte_def.args["alias"]
+                        if args and isinstance(args, exp.TableAlias):
+                            identifier = args.this
+                            if isinstance(identifier, exp.Identifier):
+                                meta = identifier.meta
+
+                                table_meta_obj = TokenPositionDetails.from_meta(meta)
+                                target_range = _range_from_token_position_details(
+                                    table_meta_obj, read_file
+                                )
+
+                                table_meta_obj = TokenPositionDetails.from_meta(table.this.meta)
+                                table_range = _range_from_token_position_details(
+                                    table_meta_obj, read_file
+                                )
+
+                                references.append(
+                                    Reference(
+                                        uri=document_uri.value,  # Same file
+                                        range=table_range,
+                                        target_range=target_range,
+                                    )
+                                )
+                    except Exception:
+                        pass
+                    continue
+
+                # For non-CTE tables, process as before (external model references)
+                # Normalize the table reference
+                unaliased = table.copy()
+                if unaliased.args.get("alias") is not None:
+                    unaliased.set("alias", None)
+                reference_name = unaliased.sql(dialect=dialect)
+                try:
+                    normalized_reference_name = normalize_model_name(
+                        reference_name,
+                        default_catalog=lint_context.context.default_catalog,
+                        dialect=dialect,
+                    )
+                    if normalized_reference_name not in depends_on:
+                        continue
+                except Exception:
+                    # Skip references that cannot be normalized
+                    continue
+
+                # Get the referenced model uri
+                referenced_model = lint_context.context.get_model(
+                    model_or_snapshot=normalized_reference_name, raise_if_missing=False
+                )
+                if referenced_model is None:
+                    continue
+                referenced_model_path = referenced_model._path
+                # Check whether the path exists
+                if not referenced_model_path.is_file():
+                    continue
+                referenced_model_uri = URI.from_path(referenced_model_path)
+
+                # Extract metadata for positioning
+                table_meta = TokenPositionDetails.from_meta(table.this.meta)
+                table_range = _range_from_token_position_details(table_meta, read_file)
+                start_pos = table_range.start
+                end_pos = table_range.end
+
+                # If there's a catalog or database qualifier, adjust the start position
+                catalog_or_db = table.args.get("catalog") or table.args.get("db")
+                if catalog_or_db is not None:
+                    catalog_or_db_meta = TokenPositionDetails.from_meta(catalog_or_db.meta)
+                    catalog_or_db_range = _range_from_token_position_details(
+                        catalog_or_db_meta, read_file
+                    )
+                    start_pos = catalog_or_db_range.start
+
+                references.append(
+                    Reference(
+                        uri=referenced_model_uri.value,
+                        range=Range(start=start_pos, end=end_pos),
+                        description=referenced_model.description,
+                    )
+                )
 
     return references
 
diff --git a/tests/lsp/test_reference_cte.py b/tests/lsp/test_reference_cte.py
@@ -0,0 +1,64 @@
+import re
+from sqlmesh.core.context import Context
+from sqlmesh.lsp.context import LSPContext, ModelTarget
+from sqlmesh.lsp.reference import get_references
+from sqlmesh.lsp.uri import URI
+from lsprotocol.types import Range, Position
+import typing as t
+
+
+def test_cte_parsing():
+    context = Context(paths=["examples/sushi"])
+    lsp_context = LSPContext(context)
+
+    # Resolve the file path of the sushi.customers model from the LSP context map
+    sushi_customers_path = next(
+        path
+        for path, info in lsp_context.map.items()
+        if isinstance(info, ModelTarget) and "sushi.customers" in info.names
+    )
+
+    with open(sushi_customers_path, "r", encoding="utf-8") as file:
+        read_file = file.readlines()
+
+    # Cursor goes 4 chars into the 2nd match: the CTE reference (the 1st match is its definition)
+    ranges = find_ranges_from_regex(read_file, r"current_marketing(?!_outer)")
+    assert len(ranges) == 2
+    position = Position(line=ranges[1].start.line, character=ranges[1].start.character + 4)
+    references = get_references(lsp_context, URI.from_path(sushi_customers_path), position)
+    assert len(references) == 1
+    assert references[0].uri == URI.from_path(sushi_customers_path).value
+    assert references[0].description is None
+    assert (
+        references[0].range.start.line == ranges[1].start.line
+    )  # The reference location (where we clicked)
+    assert (
+        references[0].target_range.start.line == ranges[0].start.line
+    )  # The CTE definition location
+
+    # Same check for current_marketing_outer: 2nd match is the reference, 1st is the definition
+    ranges = find_ranges_from_regex(read_file, r"current_marketing_outer")
+    assert len(ranges) == 2
+    position = Position(line=ranges[1].start.line, character=ranges[1].start.character + 4)
+    references = get_references(lsp_context, URI.from_path(sushi_customers_path), position)
+    assert len(references) == 1
+    assert references[0].uri == URI.from_path(sushi_customers_path).value
+    assert references[0].description is None
+    assert (
+        references[0].range.start.line == ranges[1].start.line
+    )  # The reference location (where we clicked)
+    assert (
+        references[0].target_range.start.line == ranges[0].start.line
+    )  # The CTE definition location
+
+
+def find_ranges_from_regex(read_file: t.List[str], regex: str) -> t.List[Range]:
+    """Return one Range per non-overlapping regex match on each line of read_file."""
+    return [
+        Range(
+            start=Position(line=line_number, character=match.start()),
+            end=Position(line=line_number, character=match.end()),
+        )
+        for line_number, line in enumerate(read_file)
+        for match in re.finditer(regex, line)
+    ]