pairwise_dist_fl()

适用于:✅Azure 数据资源管理器

根据多个名义变量和数值变量计算实体之间的成对距离。

函数 pairwise_dist_fl() 是一个 UDF(用户定义的函数),它在考虑名义变量和数值变量的情况下,计算属于同一分区的数据点之间的多元距离。

  • 除了实体和分区名称以外,还会将所有字符串字段视为名义变量。 如果值不同,则距离等于 1;如果值相同,则距离等于 0。
  • 所有数值字段被视为数值变量。 这些字段通过转换为 z 评分进行规范化,距离计算为差的绝对值。 数据点之间的总多元距离计算为变量之间的距离的平均值。

距离接近 0 表示实体相似,距离大于 1 表示实体有差别。 同样,平均距离接近或大于 1 的实体表示它与分区中的其他许多实体不同,这表明可能存在离群值。

该函数的输出是同一分区下实体之间的成对距离。 可以按原样使用该函数来寻找相似或不同的对。 例如,距离最小的实体具有许多共同特征。 还可以轻松将它转换为距离矩阵(参阅用法示例),或者将它用作聚类或离群值检测算法的输入。

语法

pairwise_dist_fl(entity, partition)

详细了解语法约定

参数

客户 类型​​ 必需 说明
实体 string ✔️ 输入表列的名称,其中包含要计算其距离的实体的名称或 ID。
partition string ✔️ 包含分区或范围的输入表列的名称,用于计算同一分区下所有实体对的距离。

函数定义

可以通过将函数的代码嵌入为查询定义的函数,或将其创建为数据库中的存储函数来定义函数,如下所示:

使用以下 let 语句定义函数。 不需要任何权限。

重要

let 语句不能独立运行。 它必须后跟一个表格表达式语句。 若要运行 pairwise_dist_fl() 的工作示例,请参阅示例

let pairwise_dist_fl = (tbl:(*), id_col:string, partition_col:string)
{
    let generic_dist = (value1:dynamic, value2:dynamic) 
    {
        // Calculates the distance between two values; treats all strings as nominal values and numbers as numerical,
        // can be extended to other data types or tweaked by adding weights or changing formulas.
            iff(gettype(value1[0]) == "string", todouble(tostring(value1[0]) != tostring(value2[0])), abs(todouble(value1[0]) - todouble(value2[0])))
    };
    let T = (tbl | extend _entity = column_ifexists(id_col, ''), _partition = column_ifexists(partition_col, '') | project-reorder _entity, _partition);
    let sum_data = (
        // Calculates summary statistics to be used for normalization.
        T
        | project-reorder _entity
        | project _partition, p = pack_array(*)
        | mv-expand with_itemindex=idx p
        | summarize count(), avg(todouble(p)), stdev(todouble(p)) by _partition, idx
        | sort by _partition, idx asc
        | summarize make_list(avg_p), make_list(stdev_p) by _partition
    );
    let normalized_data = (
        // Performs normalization on numerical variables by substrcting mean and scaling by standard deviation. Other normalization techniques can be used
        // by adding metrics to previous function and using here.
        T
        | project _partition, p = pack_array(*)
        | join kind = leftouter (sum_data) on _partition
        | mv-apply p, list_avg_p, list_stdev_p on (
            extend normalized = iff((not(isnan(todouble(list_avg_p))) and (list_stdev_p > 0)), pack_array((todouble(p) - todouble(list_avg_p))/todouble(list_stdev_p)), p)
            | summarize a = make_list(normalized) by _partition
        )
        | project _partition, a
    );
    let dist_data = (
        // Calculates distances of included variables and sums them up to get a multivariate distance between all entities under the same partition.
        normalized_data
        | join kind = inner (normalized_data) on _partition
        | project entity = tostring(a[0]), entity1 = tostring(a1[0]), a = array_slice(a, 1, -1), a1 = array_slice(a1, 1, -1), _partition
        | mv-apply a, a1 on 
        (
            project d = generic_dist(pack_array(a), pack_array(a1))
            | summarize d = make_list(d)
        )
        | extend dist = bin((1.0*array_sum(d)-1.0)/array_length(d), 0.0001) // -1 cancels the artifact distance calculated between entity names appearing in the bag and normalizes by number of features        
        | project-away d
        | where entity != entity1
        | sort by _partition asc, entity asc, dist asc
    );
    dist_data
};
// Write your query to use the function here.

示例

以下示例使用 invoke 运算符运行函数。

若要使用查询定义的函数,请在嵌入的函数定义后调用它。

let pairwise_dist_fl = (tbl:(*), id_col:string, partition_col:string)
{
    let generic_dist = (value1:dynamic, value2:dynamic) 
    {
        // Calculates the distance between two values; treats all strings as nominal values and numbers as numerical,
        // can be extended to other data types or tweaked by adding weights or changing formulas.
            iff(gettype(value1[0]) == "string", todouble(tostring(value1[0]) != tostring(value2[0])), abs(todouble(value1[0]) - todouble(value2[0])))
    };
    let T = (tbl | extend _entity = column_ifexists(id_col, ''), _partition = column_ifexists(partition_col, '') | project-reorder _entity, _partition);
    let sum_data = (
        // Calculates summary statistics to be used for normalization.
        T
        | project-reorder _entity
        | project _partition, p = pack_array(*)
        | mv-expand with_itemindex=idx p
        | summarize count(), avg(todouble(p)), stdev(todouble(p)) by _partition, idx
        | sort by _partition, idx asc
        | summarize make_list(avg_p), make_list(stdev_p) by _partition
    );
    let normalized_data = (
        // Performs normalization on numerical variables by substrcting mean and scaling by standard deviation. Other normalization techniques can be used
        // by adding metrics to previous function and using here.
        T
        | project _partition, p = pack_array(*)
        | join kind = leftouter (sum_data) on _partition
        | mv-apply p, list_avg_p, list_stdev_p on (
            extend normalized = iff((not(isnan(todouble(list_avg_p))) and (list_stdev_p > 0)), pack_array((todouble(p) - todouble(list_avg_p))/todouble(list_stdev_p)), p)
            | summarize a = make_list(normalized) by _partition
        )
        | project _partition, a
    );
    let dist_data = (
        // Calculates distances of included variables and sums them up to get a multivariate distance between all entities under the same partition.
        normalized_data
        | join kind = inner (normalized_data) on _partition
        | project entity = tostring(a[0]), entity1 = tostring(a1[0]), a = array_slice(a, 1, -1), a1 = array_slice(a1, 1, -1), _partition
        | mv-apply a, a1 on 
        (
            project d = generic_dist(pack_array(a), pack_array(a1))
            | summarize d = make_list(d)
        )
        | extend dist = bin((1.0*array_sum(d)-1.0)/array_length(d), 0.0001) // -1 cancels the artifact distance calculated between entity names appearing in the bag and normalizes by number of features        
        | project-away d
        | where entity != entity1
        | sort by _partition asc, entity asc, dist asc
    );
    dist_data
};
//
let raw_data = datatable(name:string, gender: string, height:int, weight:int, limbs:int, accessory:string, type:string)[
    'Andy',     'M',    160,    80,     4,  'Hat',      'Person',
    'Betsy',    'F',    170,    70,     4,  'Bag',      'Person',
    'Cindy',    'F',    130,    30,     4,  'Hat',      'Person',
    'Dan',      'M',    190,    105,    4,  'Hat',      'Person',
    'Elmie',    'M',    110,    30,     4,  'Toy',      'Person',
    'Franny',   'F',    170,    65,     4,  'Bag',      'Person',
    'Godzilla', '?',    260,    210,    5,  'Tail',     'Person',
    'Hannie',   'F',    112,    28,     4,  'Toy',      'Person',
    'Ivie',     'F',    105,    20,     4,  'Toy',      'Person',
    'Johnnie',  'M',    107,    21,     4,  'Toy',      'Person',
    'Kyle',     'M',    175,    76,     4,  'Hat',      'Person',
    'Laura',    'F',    180,    70,     4,  'Bag',      'Person',
    'Mary',     'F',    160,    60,     4,  'Bag',      'Person',
    'Noah',     'M',    178,    90,     4,  'Hat',      'Person',
    'Odelia',   'F',    186,    76,     4,  'Bag',      'Person',
    'Paul',     'M',    158,    69,     4,  'Bag',      'Person',
    'Qui',      'F',    168,    62,     4,  'Bag',      'Person',
    'Ronnie',   'M',    108,    26,     4,  'Toy',      'Person',
    'Sonic',    'F',    52,     20,     6,  'Tail',     'Pet',
    'Tweety',   'F',    52,     20,     6,  'Tail',     'Pet' ,
    'Ulfie',    'M',    39,     29,     4,  'Wings',    'Pet',
    'Vinnie',   'F',    53,     22,     1,  'Tail',     'Pet',
    'Waldo',    'F',    51,     21,     4,  'Tail',     'Pet',
    'Xander',   'M',    50,     24,     4,  'Tail',     'Pet'
];
raw_data
| invoke pairwise_dist_fl('name', 'type')
| where _partition == 'Person' | sort by entity asc, entity1 asc
| evaluate pivot (entity, max(dist), entity1) | sort by entity1 asc

输出

entity1 Andy Betsy Cindy Dan Elmie Franny Godzilla Hannie ...
Andy 0.354 0.4125 0.1887 0.4843 0.3702 1.2087 0.6265 ...
Betsy 0.354 0.416 0.4708 0.6307 0.0161 1.2051 0.4872 ...
Cindy 0.4125 0.416 0.6012 0.3575 0.3998 1.4783 0.214 ...
Dan 0.1887 0.4708 0.6012 0.673 0.487 1.0199 0.8152 ...
Elmie 0.4843 0.6307 0.3575 0.673 0.6145 1.5502 0.1565 ...
Franny 0.3702 0.0161 0.3998 0.487 0.6145 1.2213 0.471 ...
Godzilla 1.2087 1.2051 1.4783 1.0199 1.5502 1.2213 1.5495 ...
Hannie 0.6265 0.4872 0.214 0.8152 0.1565 0.471 1.5495 ...
... ... ... ... ... ... ... ... ... ...

面对两种不同类型的实体,我们希望在考虑名义变量(例如性别或偏好配件)和数值变量(例如肢体数量、身高、和重量)的情况下,计算属于同一类型的实体之间的距离。 数值变量位于不同的标度,必须集中并缩放,这会由系统自动完成。 输出是同一分区下的实体对,计算了其多元距离。 可以直接分析输出,将其可视化为距离矩阵或散点图,或者通过计算每个实体的平均距离(值较大的实体表示全局离群值)将其用作离群值检测算法的输入数据。 例如,使用距离矩阵添加可选的可视化效果时,会得到示例中所示的表。 在示例中可以看到:

  • 某些实体对(Betsy 和 Franny)的距离值较小(接近 0),表示它们相似。
  • 某些实体对(Godzilla 和 Elmie)的距离值较大(1 或更大),表示它们不同。

可以进一步使用输出来计算每个实体的平均距离。 较大的平均距离可能表示存在全局离群值。 例如,我们可以看到,Godzilla 与其他实体之间的平均距离较大,表示它可能是一个全局离群值。