ibis: bug: regression: slow to generate SQL, generated SQL is much slower
What happened?
I have some very complicated ibis expression.
On 8.0.0:
print(ibis.to_sql(expr)) # 0.1 sec
expr.cache(). # 1.5 sec
and I get the sql
WITH t0 AS (
SELECT
t9.prefix AS prefix,
CASE
WHEN (
TRIM(
LOWER(
NULLIF(
TRIM(
REGEXP_REPLACE(
REGEXP_REPLACE(REGEXP_REPLACE(t9.first_name, '[^\\x00-\\x7F]+', '', 'g'), '[^A-Za-z0-9]+', ' ', 'g'),
'\\s+',
' ',
'g'
),
'
'
),
''
)
),
'
'
) = TRIM(
LOWER(
NULLIF(
TRIM(
REGEXP_REPLACE(
REGEXP_REPLACE(REGEXP_REPLACE(t9.last_name, '[^\\x00-\\x7F]+', '', 'g'), '[^A-Za-z0-9]+', ' ', 'g'),
'\\s+',
' ',
'g'
),
'
'
),
''
)
),
'
'
)
AND COALESCE(
ARRAY_LENGTH(
STR_SPLIT(
NULLIF(
TRIM(
REGEXP_REPLACE(
TRIM(
LOWER(
NULLIF(
TRIM(
REGEXP_REPLACE(
REGEXP_REPLACE(REGEXP_REPLACE(t9.first_name, '[^\\x00-\\x7F]+', '', 'g'), '[^A-Za-z0-9]+', ' ', 'g'),
'\\s+',
' ',
'g'
),
'
'
),
''
)
),
'
'
),
'\\s+',
' ',
'g'
),
'
'
),
''
),
' '
)
),
CAST(0 AS TINYINT)
) = CAST(1 AS TINYINT)
AND COALESCE(
ARRAY_LENGTH(
STR_SPLIT(
NULLIF(
TRIM(
REGEXP_REPLACE(
TRIM(
LOWER(
NULLIF(
TRIM(
REGEXP_REPLACE(
REGEXP_REPLACE(REGEXP_REPLACE(t9.last_name, '[^\\x00-\\x7F]+', '', 'g'), '[^A-Za-z0-9]+', ' ', 'g'),
'\\s+',
' ',
'g'
),
'
'
),
''
)
),
'
'
),
'\\s+',
' ',
'g'
),
'
'
),
''
),
' '
)
),
CAST(0 AS TINYINT)
) = CAST(1 AS TINYINT)
)
THEN NULL
ELSE t9.first_name
END AS first_name,
t9.middle_name AS middle_name,
t9.last_name AS last_name,
t9.suffix AS suffix,
t9.nickname AS nickname
FROM main.ibis_cache_m5utd5pimjfz7jmp2ojvapbyim AS t9
), t1 AS (
SELECT
t0.prefix AS prefix,
t0.first_name AS first_name,
t0.middle_name AS middle_name,
t0.last_name AS last_name,
t0.suffix AS suffix,
t0.nickname AS nickname,
STR_SPLIT(t0.prefix, ' ') AS prefix_tokens,
STR_SPLIT(t0.first_name, ' ') AS first_name_tokens,
STR_SPLIT(t0.middle_name, ' ') AS middle_name_tokens,
STR_SPLIT(t0.last_name, ' ') AS last_name_tokens,
STR_SPLIT(t0.suffix, ' ') AS suffix_tokens,
STR_SPLIT(t0.nickname, ' ') AS nickname_tokens
FROM t0
), t2 AS (
SELECT
t1.prefix AS prefix,
t1.first_name AS first_name,
t1.middle_name AS middle_name,
t1.last_name AS last_name,
t1.suffix AS suffix,
t1.nickname AS nickname,
t1.prefix_tokens AS prefix_tokens,
t1.first_name_tokens AS first_name_tokens,
t1.middle_name_tokens AS middle_name_tokens,
t1.last_name_tokens AS last_name_tokens,
t1.suffix_tokens AS suffix_tokens,
t1.nickname_tokens AS nickname_tokens,
CASE
WHEN (
ARRAY_LENGTH(t1.prefix_tokens) = CAST(1 AS TINYINT)
)
THEN UPPER(t1.prefix)
ELSE NULL
END AS prefix_single,
CASE
WHEN (
ARRAY_LENGTH(t1.first_name_tokens) = CAST(1 AS TINYINT)
)
THEN UPPER(t1.first_name)
ELSE NULL
END AS first_name_single,
CASE
WHEN (
ARRAY_LENGTH(t1.middle_name_tokens) = CAST(1 AS TINYINT)
)
THEN UPPER(t1.middle_name)
ELSE NULL
END AS middle_name_single,
CASE
WHEN (
ARRAY_LENGTH(t1.last_name_tokens) = CAST(1 AS TINYINT)
)
THEN UPPER(t1.last_name)
ELSE NULL
END AS last_name_single,
CASE
WHEN (
ARRAY_LENGTH(t1.suffix_tokens) = CAST(1 AS TINYINT)
)
THEN UPPER(t1.suffix)
ELSE NULL
END AS suffix_single,
CASE
WHEN (
ARRAY_LENGTH(t1.nickname_tokens) = CAST(1 AS TINYINT)
)
THEN UPPER(t1.nickname)
ELSE NULL
END AS nickname_single
FROM t1
), t3 AS (
SELECT
t2.prefix AS prefix,
t2.first_name AS first_name,
t2.middle_name AS middle_name,
t2.last_name AS last_name,
t2.suffix AS suffix,
t2.nickname AS nickname,
t2.prefix_tokens AS prefix_tokens,
t2.first_name_tokens AS first_name_tokens,
t2.middle_name_tokens AS middle_name_tokens,
t2.last_name_tokens AS last_name_tokens,
t2.suffix_tokens AS suffix_tokens,
t2.nickname_tokens AS nickname_tokens,
t2.prefix_single AS prefix_single,
t2.first_name_single AS first_name_single,
t2.middle_name_single AS middle_name_single,
t2.last_name_single AS last_name_single,
t2.suffix_single AS suffix_single,
t2.nickname_single AS nickname_single,
CAST([t2.first_name_single, t2.middle_name_single, t2.last_name_single, t2.suffix_single, t2.nickname_single] AS TEXT[]) AS singles_except_prefix,
CAST([t2.prefix_single, t2.middle_name_single, t2.last_name_single, t2.suffix_single, t2.nickname_single] AS TEXT[]) AS singles_except_first_name,
CAST([t2.prefix_single, t2.first_name_single, t2.last_name_single, t2.suffix_single, t2.nickname_single] AS TEXT[]) AS singles_except_middle_name,
CAST([t2.prefix_single, t2.first_name_single, t2.middle_name_single, t2.suffix_single, t2.nickname_single] AS TEXT[]) AS singles_except_last_name,
CAST([t2.prefix_single, t2.first_name_single, t2.middle_name_single, t2.last_name_single, t2.nickname_single] AS TEXT[]) AS singles_except_suffix,
CAST([t2.prefix_single, t2.first_name_single, t2.middle_name_single, t2.last_name_single, t2.suffix_single] AS TEXT[]) AS singles_except_nickname
FROM t2
), t4 AS (
SELECT
t3.prefix AS prefix,
t3.first_name AS first_name,
t3.middle_name AS middle_name,
t3.last_name AS last_name,
t3.suffix AS suffix,
t3.nickname AS nickname,
t3.prefix_tokens AS prefix_tokens,
t3.first_name_tokens AS first_name_tokens,
t3.middle_name_tokens AS middle_name_tokens,
t3.last_name_tokens AS last_name_tokens,
t3.suffix_tokens AS suffix_tokens,
t3.nickname_tokens AS nickname_tokens,
t3.prefix_single AS prefix_single,
t3.first_name_single AS first_name_single,
t3.middle_name_single AS middle_name_single,
t3.last_name_single AS last_name_single,
t3.suffix_single AS suffix_single,
t3.nickname_single AS nickname_single,
t3.singles_except_prefix AS singles_except_prefix,
t3.singles_except_first_name AS singles_except_first_name,
t3.singles_except_middle_name AS singles_except_middle_name,
t3.singles_except_last_name AS singles_except_last_name,
t3.singles_except_suffix AS singles_except_suffix,
t3.singles_except_nickname AS singles_except_nickname,
LIST_FILTER(
t3.prefix_tokens,
__ibis_param_token__ -> NOT ARRAY_CONTAINS(t3.singles_except_prefix, UPPER(__ibis_param_token__))
) AS prefix_tokens_filtered,
LIST_FILTER(
t3.first_name_tokens,
__ibis_param_token__ -> NOT ARRAY_CONTAINS(t3.singles_except_first_name, UPPER(__ibis_param_token__))
) AS first_name_tokens_filtered,
LIST_FILTER(
t3.middle_name_tokens,
__ibis_param_token__ -> NOT ARRAY_CONTAINS(t3.singles_except_middle_name, UPPER(__ibis_param_token__))
) AS middle_name_tokens_filtered,
LIST_FILTER(
t3.last_name_tokens,
__ibis_param_token__ -> NOT ARRAY_CONTAINS(t3.singles_except_last_name, UPPER(__ibis_param_token__))
) AS last_name_tokens_filtered,
LIST_FILTER(
t3.suffix_tokens,
__ibis_param_token__ -> NOT ARRAY_CONTAINS(t3.singles_except_suffix, UPPER(__ibis_param_token__))
) AS suffix_tokens_filtered,
LIST_FILTER(
t3.nickname_tokens,
__ibis_param_token__ -> NOT ARRAY_CONTAINS(t3.singles_except_nickname, UPPER(__ibis_param_token__))
) AS nickname_tokens_filtered
FROM t3
), t5 AS (
SELECT
ARRAY_AGGR(t4.prefix_tokens_filtered, 'string_agg', ' ') AS prefix,
ARRAY_AGGR(t4.first_name_tokens_filtered, 'string_agg', ' ') AS first_name,
ARRAY_AGGR(t4.middle_name_tokens_filtered, 'string_agg', ' ') AS middle_name,
ARRAY_AGGR(t4.last_name_tokens_filtered, 'string_agg', ' ') AS last_name,
ARRAY_AGGR(t4.suffix_tokens_filtered, 'string_agg', ' ') AS suffix,
ARRAY_AGGR(t4.nickname_tokens_filtered, 'string_agg', ' ') AS nickname,
t4.prefix_tokens AS prefix_tokens,
t4.first_name_tokens AS first_name_tokens,
t4.middle_name_tokens AS middle_name_tokens,
t4.last_name_tokens AS last_name_tokens,
t4.suffix_tokens AS suffix_tokens,
t4.nickname_tokens AS nickname_tokens,
t4.prefix_single AS prefix_single,
t4.first_name_single AS first_name_single,
t4.middle_name_single AS middle_name_single,
t4.last_name_single AS last_name_single,
t4.suffix_single AS suffix_single,
t4.nickname_single AS nickname_single,
t4.singles_except_prefix AS singles_except_prefix,
t4.singles_except_first_name AS singles_except_first_name,
t4.singles_except_middle_name AS singles_except_middle_name,
t4.singles_except_last_name AS singles_except_last_name,
t4.singles_except_suffix AS singles_except_suffix,
t4.singles_except_nickname AS singles_except_nickname,
t4.prefix_tokens_filtered AS prefix_tokens_filtered,
t4.first_name_tokens_filtered AS first_name_tokens_filtered,
t4.middle_name_tokens_filtered AS middle_name_tokens_filtered,
t4.last_name_tokens_filtered AS last_name_tokens_filtered,
t4.suffix_tokens_filtered AS suffix_tokens_filtered,
t4.nickname_tokens_filtered AS nickname_tokens_filtered
FROM t4
), t6 AS (
SELECT
t5.prefix AS prefix,
t5.first_name AS first_name,
t5.middle_name AS middle_name,
t5.last_name AS last_name,
t5.suffix AS suffix,
t5.nickname AS nickname
FROM t5
), t7 AS (
SELECT
TRIM(
REGEXP_REPLACE(
REGEXP_REPLACE(
REGEXP_REPLACE(
REGEXP_REPLACE(
ARRAY_AGGR(
LIST_APPLY(
STR_SPLIT(REGEXP_REPLACE(t6.prefix, '[^a-zA-Z \\-'']', '', 'g'), ' '),
__ibis_param_t__ -> CASE
WHEN (
LENGTH(REGEXP_REPLACE(__ibis_param_t__, '[^A-Z]', '', 'g')) >= CAST(2 AS TINYINT)
AND LENGTH(REGEXP_REPLACE(__ibis_param_t__, '[^a-z]', '', 'g')) >= CAST(1 AS TINYINT)
OR LENGTH(REGEXP_REPLACE(__ibis_param_t__, '[^A-Z]', '', 'g')) >= CAST(1 AS TINYINT)
AND REGEXP_MATCHES(__ibis_param_t__, '[a-z]')
)
THEN __ibis_param_t__
ELSE CONCAT(UPPER(SUBSTR(__ibis_param_t__, 1, 1)), LOWER(SUBSTR(__ibis_param_t__, 2)))
END
),
'string_agg',
' '
),
'\\s+',
' ',
'g'
),
'(\\W) (\\W)',
'\\1\\2',
'g'
),
'(\\w) (\\W)',
'\\1\\2',
'g'
),
'(\\W) (\\w)',
'\\1\\2',
'g'
),
'
'
) AS prefix,
TRIM(
REGEXP_REPLACE(
REGEXP_REPLACE(
REGEXP_REPLACE(
REGEXP_REPLACE(
ARRAY_AGGR(
LIST_APPLY(
STR_SPLIT(REGEXP_REPLACE(t6.first_name, '[^a-zA-Z \\-'']', '', 'g'), ' '),
__ibis_param_t__ -> CASE
WHEN (
LENGTH(REGEXP_REPLACE(__ibis_param_t__, '[^A-Z]', '', 'g')) >= CAST(2 AS TINYINT)
AND LENGTH(REGEXP_REPLACE(__ibis_param_t__, '[^a-z]', '', 'g')) >= CAST(1 AS TINYINT)
OR LENGTH(REGEXP_REPLACE(__ibis_param_t__, '[^A-Z]', '', 'g')) >= CAST(1 AS TINYINT)
AND REGEXP_MATCHES(__ibis_param_t__, '[a-z]')
)
THEN __ibis_param_t__
ELSE CONCAT(UPPER(SUBSTR(__ibis_param_t__, 1, 1)), LOWER(SUBSTR(__ibis_param_t__, 2)))
END
),
'string_agg',
' '
),
'\\s+',
' ',
'g'
),
'(\\W) (\\W)',
'\\1\\2',
'g'
),
'(\\w) (\\W)',
'\\1\\2',
'g'
),
'(\\W) (\\w)',
'\\1\\2',
'g'
),
'
'
) AS first_name,
TRIM(
REGEXP_REPLACE(
REGEXP_REPLACE(
REGEXP_REPLACE(
REGEXP_REPLACE(
ARRAY_AGGR(
LIST_APPLY(
STR_SPLIT(REGEXP_REPLACE(t6.middle_name, '[^a-zA-Z \\-'']', '', 'g'), ' '),
__ibis_param_t__ -> CASE
WHEN (
LENGTH(REGEXP_REPLACE(__ibis_param_t__, '[^A-Z]', '', 'g')) >= CAST(2 AS TINYINT)
AND LENGTH(REGEXP_REPLACE(__ibis_param_t__, '[^a-z]', '', 'g')) >= CAST(1 AS TINYINT)
OR LENGTH(REGEXP_REPLACE(__ibis_param_t__, '[^A-Z]', '', 'g')) >= CAST(1 AS TINYINT)
AND REGEXP_MATCHES(__ibis_param_t__, '[a-z]')
)
THEN __ibis_param_t__
ELSE CONCAT(UPPER(SUBSTR(__ibis_param_t__, 1, 1)), LOWER(SUBSTR(__ibis_param_t__, 2)))
END
),
'string_agg',
' '
),
'\\s+',
' ',
'g'
),
'(\\W) (\\W)',
'\\1\\2',
'g'
),
'(\\w) (\\W)',
'\\1\\2',
'g'
),
'(\\W) (\\w)',
'\\1\\2',
'g'
),
'
'
) AS middle_name,
TRIM(
REGEXP_REPLACE(
REGEXP_REPLACE(
REGEXP_REPLACE(
REGEXP_REPLACE(
ARRAY_AGGR(
LIST_APPLY(
STR_SPLIT(REGEXP_REPLACE(t6.last_name, '[^a-zA-Z \\-'']', '', 'g'), ' '),
__ibis_param_t__ -> CASE
WHEN (
LENGTH(REGEXP_REPLACE(__ibis_param_t__, '[^A-Z]', '', 'g')) >= CAST(2 AS TINYINT)
AND LENGTH(REGEXP_REPLACE(__ibis_param_t__, '[^a-z]', '', 'g')) >= CAST(1 AS TINYINT)
OR LENGTH(REGEXP_REPLACE(__ibis_param_t__, '[^A-Z]', '', 'g')) >= CAST(1 AS TINYINT)
AND REGEXP_MATCHES(__ibis_param_t__, '[a-z]')
)
THEN __ibis_param_t__
ELSE CONCAT(UPPER(SUBSTR(__ibis_param_t__, 1, 1)), LOWER(SUBSTR(__ibis_param_t__, 2)))
END
),
'string_agg',
' '
),
'\\s+',
' ',
'g'
),
'(\\W) (\\W)',
'\\1\\2',
'g'
),
'(\\w) (\\W)',
'\\1\\2',
'g'
),
'(\\W) (\\w)',
'\\1\\2',
'g'
),
'
'
) AS last_name,
TRIM(
REGEXP_REPLACE(
REGEXP_REPLACE(
REGEXP_REPLACE(
REGEXP_REPLACE(
ARRAY_AGGR(
LIST_APPLY(
STR_SPLIT(REGEXP_REPLACE(t6.suffix, '[^a-zA-Z \\-'']', '', 'g'), ' '),
__ibis_param_t__ -> CASE
WHEN (
LENGTH(REGEXP_REPLACE(__ibis_param_t__, '[^A-Z]', '', 'g')) >= CAST(2 AS TINYINT)
AND LENGTH(REGEXP_REPLACE(__ibis_param_t__, '[^a-z]', '', 'g')) >= CAST(1 AS TINYINT)
OR LENGTH(REGEXP_REPLACE(__ibis_param_t__, '[^A-Z]', '', 'g')) >= CAST(1 AS TINYINT)
AND REGEXP_MATCHES(__ibis_param_t__, '[a-z]')
)
THEN __ibis_param_t__
ELSE CONCAT(UPPER(SUBSTR(__ibis_param_t__, 1, 1)), LOWER(SUBSTR(__ibis_param_t__, 2)))
END
),
'string_agg',
' '
),
'\\s+',
' ',
'g'
),
'(\\W) (\\W)',
'\\1\\2',
'g'
),
'(\\w) (\\W)',
'\\1\\2',
'g'
),
'(\\W) (\\w)',
'\\1\\2',
'g'
),
'
'
) AS suffix,
TRIM(
REGEXP_REPLACE(
REGEXP_REPLACE(
REGEXP_REPLACE(
REGEXP_REPLACE(
ARRAY_AGGR(
LIST_APPLY(
STR_SPLIT(REGEXP_REPLACE(t6.nickname, '[^a-zA-Z \\-'']', '', 'g'), ' '),
__ibis_param_t__ -> CASE
WHEN (
LENGTH(REGEXP_REPLACE(__ibis_param_t__, '[^A-Z]', '', 'g')) >= CAST(2 AS TINYINT)
AND LENGTH(REGEXP_REPLACE(__ibis_param_t__, '[^a-z]', '', 'g')) >= CAST(1 AS TINYINT)
OR LENGTH(REGEXP_REPLACE(__ibis_param_t__, '[^A-Z]', '', 'g')) >= CAST(1 AS TINYINT)
AND REGEXP_MATCHES(__ibis_param_t__, '[a-z]')
)
THEN __ibis_param_t__
ELSE CONCAT(UPPER(SUBSTR(__ibis_param_t__, 1, 1)), LOWER(SUBSTR(__ibis_param_t__, 2)))
END
),
'string_agg',
' '
),
'\\s+',
' ',
'g'
),
'(\\W) (\\W)',
'\\1\\2',
'g'
),
'(\\w) (\\W)',
'\\1\\2',
'g'
),
'(\\W) (\\w)',
'\\1\\2',
'g'
),
'
'
) AS nickname
FROM t6
)
SELECT
t8.prefix,
t8.first_name,
CASE
WHEN (
COALESCE(t8.nickname LIKE CONCAT(t8.middle_name, '%'), CAST(FALSE AS BOOLEAN))
AND NOT t8.first_name LIKE CONCAT(t8.middle_name, '%')
)
THEN t8.nickname
ELSE t8.middle_name
END AS middle_name,
t8.last_name,
t8.suffix,
t8.nickname
FROM (
SELECT
CASE
WHEN (
UPPER(t7.prefix) = 'JR'
)
THEN 'Jr'
WHEN (
UPPER(t7.prefix) = 'SR'
)
THEN 'Sr'
WHEN (
UPPER(t7.prefix) = 'PHD'
)
THEN 'PhD'
WHEN (
UPPER(t7.prefix) = 'MR'
)
THEN 'Mr'
WHEN (
UPPER(t7.prefix) = 'MRS'
)
THEN 'Mrs'
WHEN (
UPPER(t7.prefix) = 'MS'
)
THEN 'Ms'
WHEN (
UPPER(t7.prefix) = 'DR'
)
THEN 'Dr'
ELSE UPPER(t7.prefix)
END AS prefix,
t7.first_name AS first_name,
t7.middle_name AS middle_name,
t7.last_name AS last_name,
CASE
WHEN (
UPPER(t7.suffix) = 'JR'
)
THEN 'Jr'
WHEN (
UPPER(t7.suffix) = 'SR'
)
THEN 'Sr'
WHEN (
UPPER(t7.suffix) = 'PHD'
)
THEN 'PhD'
WHEN (
UPPER(t7.suffix) = 'MR'
)
THEN 'Mr'
WHEN (
UPPER(t7.suffix) = 'MRS'
)
THEN 'Mrs'
WHEN (
UPPER(t7.suffix) = 'MS'
)
THEN 'Ms'
WHEN (
UPPER(t7.suffix) = 'DR'
)
THEN 'Dr'
ELSE UPPER(t7.suffix)
END AS suffix,
t7.nickname AS nickname
FROM t7
) AS t8
On main (b5e6373485ffdc56ed5c0232b8d341ef895b62ce) I get
print(ibis.to_sql(expr)) # 1 sec
expr.cache() #8.4 sec
and I get the sql (but actually much longer, but github won’t let me post something that long), but you can see how this uses nested expressions, not CTEs:
SELECT
CASE
WHEN UPPER(
TRIM(
REGEXP_REPLACE(
REGEXP_REPLACE(
REGEXP_REPLACE(
REGEXP_REPLACE(
ARRAY_TO_STRING(
LIST_APPLY(
STR_SPLIT(
REGEXP_REPLACE(
ARRAY_TO_STRING(
LIST_FILTER(
STR_SPLIT("t0"."prefix", ' '),
__ibis_param_token__ -> NOT (
ARRAY_CONTAINS(
[CASE
WHEN ARRAY_LENGTH(
STR_SPLIT(
CASE
WHEN (
(
TRIM(
LOWER(
NULLIF(
TRIM(
REGEXP_REPLACE(
REGEXP_REPLACE(REGEXP_REPLACE("t0"."first_name", '[^\x00-\x7F]+', '', 'g'), '[^A-Za-z0-9]+', ' ', 'g'),
'\s+',
' ',
'g'
),
'
'
),
''
)
),
'
'
) = TRIM(
LOWER(
NULLIF(
TRIM(
REGEXP_REPLACE(
REGEXP_REPLACE(REGEXP_REPLACE("t0"."last_name", '[^\x00-\x7F]+', '', 'g'), '[^A-Za-z0-9]+', ' ', 'g'),
'\s+',
' ',
'g'
),
'
'
),
''
)
),
'
'
)
)
AND (
COALESCE(
ARRAY_LENGTH(
STR_SPLIT(
NULLIF(
TRIM(
REGEXP_REPLACE(
TRIM(
LOWER(
NULLIF(
TRIM(
REGEXP_REPLACE(
REGEXP_REPLACE(REGEXP_REPLACE("t0"."first_name", '[^\x00-\x7F]+', '', 'g'), '[^A-Za-z0-9]+', ' ', 'g'),
'\s+',
' ',
'g'
),
'
'
),
''
)
),
'
'
),
'\s+',
' ',
'g'
),
'
'
),
''
),
' '
)
),
CAST(0 AS TINYINT)
) = CAST(1 AS TINYINT)
)
)
AND (
COALESCE(
ARRAY_LENGTH(
STR_SPLIT(
NULLIF(
TRIM(
REGEXP_REPLACE(
TRIM(
LOWER(
NULLIF(
TRIM(
REGEXP_REPLACE(
REGEXP_REPLACE(REGEXP_REPLACE("t0"."last_name", '[^\x00-\x7F]+', '', 'g'), '[^A-Za-z0-9]+', ' ', 'g'),
'\s+',
' ',
'g'
),
'
'
),
''
)
),
'
'
),
'\s+',
' ',
'g'
),
'
'
),
''
),
' '
)
),
CAST(0 AS TINYINT)
) = CAST(1 AS TINYINT)
)
THEN NULL
ELSE "t0"."first_name"
END,
' '
)
) = CAST(1 AS TINYINT)
THEN UPPER(
CASE
WHEN (
(
TRIM(
LOWER(
NULLIF(
TRIM(
REGEXP_REPLACE(
REGEXP_REPLACE(REGEXP_REPLACE("t0"."first_name", '[^\x00-\x7F]+', '', 'g'), '[^A-Za-z0-9]+', ' ', 'g'),
'\s+',
' ',
'g'
),
'
'
),
''
)
),
'
'
) = TRIM(
LOWER(
NULLIF(
TRIM(
REGEXP_REPLACE(
REGEXP_REPLACE(REGEXP_REPLACE("t0"."last_name", '[^\x00-\x7F]+', '', 'g'), '[^A-Za-z0-9]+', ' ', 'g'),
'\s+',
' ',
'g'
),
'
'
),
''
)
),
'
'
)
)
AND (
COALESCE(
ARRAY_LENGTH(
STR_SPLIT(
NULLIF(
TRIM(
REGEXP_REPLACE(
TRIM(
LOWER(
NULLIF(
TRIM(
REGEXP_REPLACE(
REGEXP_REPLACE(REGEXP_REPLACE("t0"."first_name", '[^\x00-\x7F]+', '', 'g'), '[^A-Za-z0-9]+', ' ', 'g'),
'\s+',
' ',
'g'
),
'
'
),
''
)
),
'
'
),
'\s+',
' ',
'g'
),
'
'
),
''
),
' '
)
),
CAST(0 AS TINYINT)
) = CAST(1 AS TINYINT)
)
)
AND (
COALESCE(
ARRAY_LENGTH(
STR_SPLIT(
NULLIF(
TRIM(
REGEXP_REPLACE(
TRIM(
LOWER(
NULLIF(
TRIM(
REGEXP_REPLACE(
REGEXP_REPLACE(REGEXP_REPLACE("t0"."last_name", '[^\x00-\x7F]+', '', 'g'), '[^A-Za-z0-9]+', ' ', 'g'),
'\s+',
' ',
'g'
),
'
'
),
''
)
),
'
'
),
'\s+',
' ',
'g'
),
'
'
),
''
),
' '
)
),
CAST(0 AS TINYINT)
) = CAST(1 AS TINYINT)
)
THEN NULL
ELSE "t0"."first_name"
END
)
ELSE NULL
END, CASE
WHEN ARRAY_LENGTH(STR_SPLIT("t0"."middle_name", ' ')) = CAST(1 AS TINYINT)
THEN UPPER("t0"."middle_name")
ELSE NULL
END, CASE
WHEN ARRAY_LENGTH(STR_SPLIT("t0"."last_name", ' ')) = CAST(1 AS TINYINT)
THEN UPPER("t0"."last_name")
ELSE NULL
END, CASE
WHEN ARRAY_LENGTH(STR_SPLIT("t0"."suffix", ' ')) = CAST(1 AS TINYINT)
THEN UPPER("t0"."suffix")
ELSE NULL
END, CASE
WHEN ARRAY_LENGTH(STR_SPLIT("t0"."nickname", ' ')) = CAST(1 AS TINYINT)
THEN UPPER("t0"."nickname")
ELSE NULL
END],
UPPER(__ibis_param_token__)
)
)
),
' '
),
'[^a-zA-Z \-'']',
'',
'g'
),
' '
),
__ibis_param_t__ -> CASE
WHEN (
(
LENGTH(REGEXP_REPLACE(__ibis_param_t__, '[^A-Z]', '', 'g')) >= CAST(2 AS TINYINT)
)
AND (
LENGTH(REGEXP_REPLACE(__ibis_param_t__, '[^a-z]', '', 'g')) >= CAST(1 AS TINYINT)
)
)
OR (
(
LENGTH(REGEXP_REPLACE(__ibis_param_t__, '[^A-Z]', '', 'g')) >= CAST(1 AS TINYINT)
)
AND REGEXP_MATCHES(__ibis_param_t__, '[a-z]')
)
THEN __ibis_param_t__
ELSE UPPER(
CASE
WHEN (
CAST(0 AS TINYINT) + 1
) >= 1
THEN SUBSTRING(__ibis_param_t__, CAST(0 AS TINYINT) + 1, CAST(1 AS TINYINT))
ELSE SUBSTRING(
__ibis_param_t__,
CAST(0 AS TINYINT) + 1 + LENGTH(__ibis_param_t__),
CAST(1 AS TINYINT)
)
END
) || LOWER(
CASE
WHEN (
CAST(1 AS TINYINT) + 1
) >= 1
THEN SUBSTRING(__ibis_param_t__, CAST(1 AS TINYINT) + 1, LENGTH(__ibis_param_t__))
ELSE SUBSTRING(
__ibis_param_t__,
CAST(1 AS TINYINT) + 1 + LENGTH(__ibis_param_t__),
LENGTH(__ibis_param_t__)
)
END
)
END
),
' '
),
'\s+',
' ',
'g'
),
'(\W) (\W)',
'\1\2',
'g'
),
'(\w) (\W)',
'\1\2',
'g'
),
'(\W) (\w)',
'\1\2',
'g'
),
'
.....
'
) AS "nickname"
FROM "ibis_cache_v5hid6mlqzejtef24szwfrzhty" AS "t0"
In actuality, I used to have an even more complex expression, but that literally took minutes to compile to SQL and I had to ctrl-c before it even finished. The .cache() also took longer than I had patience for. Then I threw some .cache()s in the middle of that chain to break it up, and this gave me the expression I have here. If I throw more .cache()s into the chain, the difference between the timings shrinks more.
What version of ibis are you using?
b5e6373485ffdc56ed5c0232b8d341ef895b62ce vs 8.0.0
What backend(s) are you using, if any?
duckdb
Relevant log output
No response
Code of Conduct
- I agree to follow this project’s Code of Conduct
About this issue
- Original URL
- State: open
- Created 4 months ago
- Comments: 36 (36 by maintainers)
sql_benchmark.zip
OK, here is a notebook that can repro the issue, and some test data of names (from the Alaska Division of Elections voterfile, this is public data)
I suspect the CTE generation, performance of generating SQL and the execution performance are all related. If we’re repeating the same subquery multiple times, the database may not automatically extract them into CTEs. We’ve already seen this with SQLite.