公用表表达式 (CTE)

适用于:勾选“是” Databricks SQL 勾选“是” Databricks Runtime

定义一个可在 SQL 语句的范围内多次引用的临时结果集。 CTE 主要在 SELECT 语句中使用。

语法

WITH [ RECURSIVE ] common_table_expression [, ...]

common_table_expression
  view_identifier [ ( column_identifier [, ...] ) ] [ AS ] ( query | recursive_query )

recursive_query
  base_case_query UNION ALL step_case_query

参数

  • 递归的

    重要

    此功能目前以公共预览版提供。

    适用于:选中标记为“是” Databricks Runtime 17.0 及更高版本

    指定时,公共表表达式可以包含一个 recursive_query

  • view_identifier

    可按其引用 common_table_expression 的标识符

  • column_identifier

    可按其引用 common_table_expression 的列的可选标识符。

    如果指定了 column_identifier,则其数量必须与 query 返回的列数相匹配。 如果未指定任何名称,则列名称派生自 query

  • 查询

    生成结果集的查询。

  • recursive_query

    适用于:选中标记为“是” Databricks Runtime 17.0 及更高版本

    RECURSIVE 指定后,CTE 的查询可以引用自身。 这使得 CTE 成为 递归 CTE

    • base_case_subquery

      此子查询提供递归的基础或开始点。 它不得引用 view_identifier

    • step_subquery

      此子查询基于上一步生成一组新的行。 要递归,必须引用view_identifier

局限性

  • 对于UPDATEDELETEMERGE语句,不支持递归 CTE。
  • step_subquery 不得包含对 view_identifier. 的关联列引用。
  • 从递归部分调用随机数生成器可以在每次迭代中生成相同的随机数。
  • 递归步骤数限制为 100。
  • 结果集的总大小限制为 1,000,000 行。

示例

-- CTE with multiple column aliases
> WITH t(x, y) AS (SELECT 1, 2)
  SELECT * FROM t WHERE x = 1 AND y = 2;
   1   2

-- CTE in CTE definition
> WITH t AS (
    WITH t2 AS (SELECT 1)
    SELECT * FROM t2)
  SELECT * FROM t;
   1

-- CTE in subquery
> SELECT max(c) FROM (
    WITH t(c) AS (SELECT 1)
    SELECT * FROM t);
      1

-- CTE in subquery expression
> SELECT (WITH t AS (SELECT 1)
          SELECT * FROM t);
                1

-- CTE in CREATE VIEW statement
> CREATE VIEW v AS
    WITH t(a, b, c, d) AS (SELECT 1, 2, 3, 4)
    SELECT * FROM t;
> SELECT * FROM v;
   1   2   3   4

-- CTE names are scoped
> WITH t  AS (SELECT 1),
       t2 AS (
        WITH t AS (SELECT 2)
        SELECT * FROM t)
SELECT * FROM t2;
   2

递归查询示例

定向图示例

以下示例演示如何查找从纽约到其他城市的所有目的地,包括直接路线和通过其他城市的路线:

> DROP VIEW IF EXISTS routes;
> CREATE TEMPORARY VIEW routes(origin, destination) AS VALUES
  ('New York', 'Washington'),
  ('New York', 'Boston'),
  ('Boston', 'New York'),
  ('Washington', 'Boston'),
  ('Washington', 'Raleigh');

> WITH RECURSIVE destinations_from_new_york AS (
    SELECT 'New York' AS destination, ARRAY('New York') AS path, 0 AS length
    UNION ALL
    SELECT r.destination, CONCAT(d.path, ARRAY(r.destination)), d.length + 1
      FROM routes AS r
      JOIN destinations_from_new_york AS d
        ON d.destination = r.origin AND NOT ARRAY_CONTAINS(d.path, r.destination)
  )
  SELECT * FROM destinations_from_new_york;
  destination path                                length
  ----------- ----------------------------------- ------
  New York    ["New York"]                        0
  Washington  ["New York","Washington"]           1
  Boston      ["New York","Boston"]               1
  Boston      ["New York","Washington","Boston"]  2
  Raleigh     ["New York","Washington","Raleigh"] 2

以下示例演示如何遍历定向图并检查遍历中是否存在无限循环:

> DROP TABLE IF EXISTS graph;
> CREATE TABLE IF NOT EXISTS graph( f int, t int, label string );
> INSERT INTO graph VALUES
    (1, 2, 'arc 1 -> 2'),
    (1, 3, 'arc 1 -> 3'),
    (2, 3, 'arc 2 -> 3'),
    (1, 4, 'arc 1 -> 4'),
    (4, 5, 'arc 4 -> 5'),
    (5, 1, 'arc 5 -> 1');
> WITH RECURSIVE search_graph(f, t, label, path, cycle) AS (
    SELECT *, array(struct(g.f, g.t)), false FROM graph g
    UNION ALL
    SELECT g.*, path || array(struct(g.f, g.t)), array_contains(path, struct(g.f, g.t))
      FROM graph g, search_graph sg
      WHERE g.f = sg.t AND NOT cycle
  )
  SELECT path FROM search_graph;

结果是一组增加长度的路径,同时避免循环,最终以非无限路径结尾

[{"f":1,"t":4},{"f":4,"t":5},{"f":5,"t":1},{"f":1,"t":2},{"f":2,"t":3}]`

访问所有节点并在汇点“3”明确停止。 通过修改最后一行以按 path 列排序来实现深度优先搜索:

SELECT * FROM search_graph ORDER BY path;

以下示例演示基本的递归技术:

-- Simple loop-like example.
> WITH RECURSIVE r(col) AS (
    SELECT 'a'
    UNION ALL
    SELECT col || char(ascii(substr(col, -1)) + 1) FROM r WHERE LENGTH(col) < 10
  )
  SELECT * FROM r;
  col
  ----------
  a
  ab
  abc
  abcd
  abcde
  abcdef
  abcdefg
  abcdefgh
  abcdefghi
  abcdefghij

-- Another loop-like example.
> WITH RECURSIVE t(n) AS (
    SELECT 'foo'
    UNION ALL
    SELECT n || ' bar' FROM t WHERE length(n) < 20
  )
  SELECT n AS is_text FROM t;
  is_text
  -------
  foo
  foo bar
  foo bar bar
  foo bar bar bar
  foo bar bar bar bar
  foo bar bar bar bar bar

-- Invoking a recursive CTE from another CTE.
> WITH RECURSIVE x(id) AS (VALUES (1) UNION ALL SELECT id+1 FROM x WHERE id < 5),
                 y(id) AS (VALUES (1) UNION ALL SELECT id+1 FROM x WHERE id < 10)
  SELECT y.*, x.* FROM y LEFT JOIN x USING (id);
  id  id
  --  --
   1  1
   4  4
   6  null
   3  3
   5  5
   2  2

-- An example of invoking a non-recursive CTE from a recursive CTE.
> WITH RECURSIVE
    y (id) AS (VALUES (1)),
    x (id) AS (SELECT * FROM y UNION ALL SELECT id+1 FROM x WHERE id < 5)
  SELECT * FROM x;
  id
  --
   1
   2
   3
   4
   5

-- Temporary views
> CREATE OR REPLACE TEMPORARY VIEW nums AS
    WITH RECURSIVE nums (n) AS (
      VALUES (1)
      UNION ALL
      SELECT n+1 FROM nums WHERE n < 6
    )
    SELECT * FROM nums;
> SELECT * FROM nums;
  n
  __
   1
   2
   3
   4
   5
   6

> INSERT into a table
> CREATE TABLE rt(level INT);

> WITH RECURSIVE r(level) AS (
    VALUES (0)
    UNION ALL
    SELECT level + 1 FROM r WHERE level < 9
  )
  INSERT INTO rt SELECT * FROM r;

> SELECT * from rt;
  level
  -----
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9

-- Nested recursive CTE are supported (only inside anchor)
-- Returns a triangular half - total of 55 rows - of a 10x10 square of numbers 1-10
> WITH RECURSIVE t(j) AS (
    WITH RECURSIVE s(i) AS (
      VALUES (1)
      UNION ALL
      SELECT i+1 FROM s WHERE i < 10
    )
    SELECT i FROM s
    UNION ALL
    SELECT j+1 FROM t WHERE j < 10
  )
  SELECT * FROM t;
  j
  --
   1
   2
   2
   3
   3
   3
   4
  ...
   9
  10
  10
  10
  10
  10
  10
  10
  10
  10
  10

-- Honoring LIMIT
> WITH RECURSIVE t(n) AS (
    VALUES (1)
    UNION ALL
    SELECT n+1 FROM t
  )
  SELECT * FROM t LIMIT 10;
  n
  --
   1
   2
   3
   4
   5
   6
   7
   8
   9
  10

-- Organizational tree examples
-- Extracting all departments under 'A'
> CREATE TABLE department (
    id INTEGER, -- department ID
    parent_department INTEGER, -- upper department ID
    name string -- department name
  );

> INSERT INTO department VALUES
    (0, NULL, 'ROOT'),
    (1, 0, 'A'),
    (2, 1, 'B'),
    (3, 2, 'C'),
    (4, 2, 'D'),
    (5, 0, 'E'),
    (6, 4, 'F'),
    (7, 5, 'G');

> WITH RECURSIVE subdepartment AS (
    SELECT name as root_name, * FROM department WHERE name = 'A'
    UNION ALL
    SELECT sd.root_name, d.* FROM department AS d, subdepartment AS sd
     WHERE d.parent_department = sd.id
  )
  SELECT * FROM subdepartment ORDER BY name;
  root_name id  parent_department  name
  --------- --  -----------------  ----
          A	 1                  0     A
          A	 2	                1     B
          A	 3                  2     C
          A	 4                  2     D
          A	 6	                4     F

-- The same as above but extracting the level number:
> WITH RECURSIVE subdepartment(level, id, parent_department, name) AS (
    SELECT 1, * FROM department WHERE name = 'A'
    UNION ALL
    SELECT sd.level + 1, d.* FROM department AS d, subdepartment AS sd
     WHERE d.parent_department = sd.id
  )
  SELECT * FROM subdepartment ORDER BY name;
  level  id  parent_department  name
  -----  --  -----------------  ----
      1   1                  0     A
      2   2                  1     B
      3   3                  2     C
      3   4                  2     D
      4   6                  4     F

-- The same as above but only shows level 2 or more
> WITH RECURSIVE subdepartment(level, id, parent_department, name) AS (
    SELECT 1, * FROM department WHERE name = 'A'
    UNION ALL
    SELECT sd.level + 1, d.* FROM department AS d, subdepartment AS sd
     WHERE d.parent_department = sd.id
  )
  SELECT * FROM subdepartment WHERE level >= 2 ORDER BY name;
  level id  parent_department  name
  ----- --  -----------------  ----
  2      2                  1     B
  3      3                  2     C
  3      4                  2     D
  4      6                  4     F

-- Binary tree examples
-- Another tree example is to find all paths from the second level nodes "2" and "3" to all other nodes.
> CREATE TABLE tree(id INTEGER, parent_id INTEGER);
> INSERT INTO tree
    VALUES (1, NULL), (2, 1), (3, 1), (4, 2), (5, 2), ( 6, 2), ( 7, 3), ( 8, 3),
           (9,    4), (10,4), (11,7), (12,7), (13,7), (14, 9), (15,11), (16,11);

> WITH RECURSIVE t(id, path) AS (
    VALUES(1,cast(array() as array<Int>))
    UNION ALL
    SELECT tree.id, t.path || array(tree.id)
      FROM tree JOIN t ON (tree.parent_id = t.id)
  )
  SELECT t1.*, t2.*
    FROM t AS t1 JOIN t AS t2
      ON (t1.path[0] = t2.path[0] AND
          size(t1.path) = 1 AND
          size(t2.path) > 1)
  ORDER BY t1.id, t2.id;
  id  path  id  path
  --  ----  --  ------------
   2  [2]    4  [2,4]
   2  [2]    5  [2,5]
   2  [2]    6  [2,6]
   2  [2]    9  [2,4,9]
   2  [2]   10  [2,4,10]
   2  [2]   14  [2,4,9,14]
   3  [3]    7  [3,7]
   3  [3]    8  [3,8]
   3  [3]   11  [3,7,11]
   3  [3]   12  [3,7,12]
   3  [3]   13  [3,7,13]
   3  [3]   15  [3,7,11,15]
   3  [3]   16  [3,7,11,16]

-- Another tree example counts the leaf nodes reachable from nodes "2" and "3"
> CREATE TABLE tree(id INTEGER, parent_id INTEGER);
> INSERT INTO tree
    VALUES (1, NULL), ( 2,1), (3,1 ), ( 4,2), ( 5,2), ( 6,2), ( 7, 3), ( 8, 3),
           (9,    4), (10,4), (11,7), (12,7), (13,7), (14,9), (15,11), (16,11);

> WITH RECURSIVE t(id, path) AS (
    VALUES(1,cast(array() as array<Int>))
    UNION ALL
    SELECT tree.id, t.path || array(tree.id)
      FROM tree JOIN t ON (tree.parent_id = t.id)
  )
  SELECT t1.id, count(*)
    FROM t AS t1 JOIN t AS t2
      ON (t1.path[0] = t2.path[0] AND
          size(t1.path) = 1 AND
          size(t2.path) > 1)
  GROUP BY t1.id
  ORDER BY t1.id;
  id  count(1)
  --  --------
   2         6
   3         7

-- Another tree example that is listing the paths to leaf node:
> CREATE TABLE tree(id INTEGER, parent_id INTEGER);
> INSERT INTO tree
    VALUES (1, NULL), ( 2,1), ( 3,1), ( 4,2), ( 5,2), ( 6,2), ( 7, 3), ( 8, 3),
           (9,    4), (10,4), (11,7), (12,7), (13,7), (14,9), (15,11), (16,11);

> WITH RECURSIVE t(id, path) AS (
    VALUES(1,cast(array() as array<Int>))
    UNION ALL
    SELECT tree.id, t.path || array(tree.id)
      FROM tree JOIN t ON (tree.parent_id = t.id)
  )
  SELECT id, path FROM t
    ORDER BY id;
  id  path
  --  -----------
   1  []
   2  [2]
   3  [3]
   4  [2,4]
   5  [2,5]
   6  [2,6]
   7  [3,7]
   8  [3,8]
   9  [2,4,9]
  10  [2,4,10]
  11  [3,7,11]
  12  [3,7,12]
  13  [3,7,13]
  14  [2,4,9,14]
  15  [3,7,11,15]
  16  [3,7,11,16]

-- Sudoku solver (https://sqlite.org)
-- Highlighted text represent the row-wise scan of a 9x9 Sudoku grid where missing values are represented with spaces.
> WITH RECURSIVE generate_series(gs) AS (
    SELECT 1
    UNION ALL
    SELECT gs+1 FROM generate_series WHERE gs < 9
  ),
  x( s, ind ) AS
    ( SELECT sud, position( ' ' IN sud )
      FROM  (SELECT '4  8725  5  64 213 29   8      6 73 1 8 2 4  97  15 2 3  2  1    659   28 2 37946'::STRING
        AS sud) xx
    UNION ALL
    SELECT substr( s, 1, ind - 1 ) || z || substr( s, ind + 1 ),
           position(' ' in repeat('x',ind) || substr( s, ind + 1 ) )
    FROM x,
     (SELECT gs::STRING AS z FROM generate_series) z
    WHERE ind > 0
      AND NOT EXISTS ( SELECT *
        FROM generate_series
        WHERE z.z = substr( s, ( (ind - 1 ) DIV 9 ) * 9 + gs, 1 )
           OR    z.z = substr( s, mod( ind - 1, 9 ) - 8 + gs * 9, 1 )
           OR    z.z = substr( s, mod( ( ( ind - 1 ) DIV 3 ), 3 ) * 3
                       + ( ( ind - 1 ) DIV 27 ) * 27 + gs
                       + ( ( gs - 1 ) DIV 3 ) * 6, 1 )
     )
  )
  SELECT s FROM x WHERE ind = 0;
  431872569587649213629351874245968731168723495973415628394286157716594382852137946

-- You can underspecify the problem, for example by replacing the leading '4' with space ' ' which will yield two solutions
431872569587649213629351874245968731168723495973415628394286157716594382852137946
631872594587649213429351867245968731168723459973415628394286175716594382852137946

-- Finding prime numbers (Sieve of Eratosthenes, https://wiki.postgresql.org)
-- This is an implementation of the Sieve of Erastothenes algorithm for finding the prime numbers up to 150.
> WITH RECURSIVE t1(n) AS (
    VALUES(2)
    UNION ALL
    SELECT n+1 FROM t1 WHERE n < 100
  ),
  t2 (n, i) AS (
    SELECT 2*n, 2
      FROM t1 WHERE 2*n <= 150
    UNION ALL
    (
      WITH t3(k) AS (
        SELECT max(i) OVER () + 1 FROM t2
      ),
      t4(k) AS (
        SELECT DISTINCT k FROM t3
      )
      SELECT k*n, k
        FROM t1 CROSS JOIN t4
        WHERE k*k <= 150
    )
  )
  SELECT n FROM t1 EXCEPT
    SELECT n FROM t2
  ORDER BY 1;
  2
  3
  5
  7
  11
  13
  17
  19
  ...