pairwise_dist_fl()
适用于:✅Azure 数据资源管理器
根据多个名义变量和数值变量计算实体之间的成对距离。
函数 pairwise_dist_fl()
是一个 UDF(用户定义的函数),它在考虑名义变量和数值变量的情况下,计算属于同一分区的数据点之间的多元距离。
- 除了实体和分区名称以外,还会将所有字符串字段视为名义变量。 如果值不同,则距离等于 1;如果值相同,则距离等于 0。
- 所有数值字段被视为数值变量。 这些字段通过转换为 z 评分进行规范化,距离计算为差的绝对值。 数据点之间的总多元距离计算为变量之间的距离的平均值。
距离接近 0 表示实体相似,距离大于 1 表示实体有差别。 同样,平均距离接近或大于 1 的实体表示它与分区中的其他许多实体不同,这表明可能存在离群值。
该函数的输出是同一分区下实体之间的成对距离。 可以按原样使用该函数来寻找相似或不同的对。 例如,距离最小的实体具有许多共同特征。 还可以轻松将它转换为距离矩阵(参阅用法示例),或者将它用作聚类或离群值检测算法的输入。
语法
pairwise_dist_fl(
entity, partition)
详细了解语法约定。
参数
客户 | 类型 | 必需 | 说明 |
---|---|---|---|
实体 | string |
✔️ | 输入表列的名称,其中包含要计算其距离的实体的名称或 ID。 |
partition | string |
✔️ | 包含分区或范围的输入表列的名称,用于计算同一分区下所有实体对的距离。 |
函数定义
可以通过将函数的代码嵌入为查询定义的函数,或将其创建为数据库中的存储函数来定义函数,如下所示:
使用以下 let 语句定义函数。 不需要任何权限。
let pairwise_dist_fl = (tbl:(*), id_col:string, partition_col:string)
{
let generic_dist = (value1:dynamic, value2:dynamic)
{
// Calculates the distance between two values; treats all strings as nominal values and numbers as numerical,
// can be extended to other data types or tweaked by adding weights or changing formulas.
iff(gettype(value1[0]) == "string", todouble(tostring(value1[0]) != tostring(value2[0])), abs(todouble(value1[0]) - todouble(value2[0])))
};
let T = (tbl | extend _entity = column_ifexists(id_col, ''), _partition = column_ifexists(partition_col, '') | project-reorder _entity, _partition);
let sum_data = (
// Calculates summary statistics to be used for normalization.
T
| project-reorder _entity
| project _partition, p = pack_array(*)
| mv-expand with_itemindex=idx p
| summarize count(), avg(todouble(p)), stdev(todouble(p)) by _partition, idx
| sort by _partition, idx asc
| summarize make_list(avg_p), make_list(stdev_p) by _partition
);
let normalized_data = (
// Performs normalization on numerical variables by substrcting mean and scaling by standard deviation. Other normalization techniques can be used
// by adding metrics to previous function and using here.
T
| project _partition, p = pack_array(*)
| join kind = leftouter (sum_data) on _partition
| mv-apply p, list_avg_p, list_stdev_p on (
extend normalized = iff((not(isnan(todouble(list_avg_p))) and (list_stdev_p > 0)), pack_array((todouble(p) - todouble(list_avg_p))/todouble(list_stdev_p)), p)
| summarize a = make_list(normalized) by _partition
)
| project _partition, a
);
let dist_data = (
// Calculates distances of included variables and sums them up to get a multivariate distance between all entities under the same partition.
normalized_data
| join kind = inner (normalized_data) on _partition
| project entity = tostring(a[0]), entity1 = tostring(a1[0]), a = array_slice(a, 1, -1), a1 = array_slice(a1, 1, -1), _partition
| mv-apply a, a1 on
(
project d = generic_dist(pack_array(a), pack_array(a1))
| summarize d = make_list(d)
)
| extend dist = bin((1.0*array_sum(d)-1.0)/array_length(d), 0.0001) // -1 cancels the artifact distance calculated between entity names appearing in the bag and normalizes by number of features
| project-away d
| where entity != entity1
| sort by _partition asc, entity asc, dist asc
);
dist_data
};
// Write your query to use the function here.
示例
以下示例使用 invoke 运算符运行函数。
若要使用查询定义的函数,请在嵌入的函数定义后调用它。
let pairwise_dist_fl = (tbl:(*), id_col:string, partition_col:string)
{
let generic_dist = (value1:dynamic, value2:dynamic)
{
// Calculates the distance between two values; treats all strings as nominal values and numbers as numerical,
// can be extended to other data types or tweaked by adding weights or changing formulas.
iff(gettype(value1[0]) == "string", todouble(tostring(value1[0]) != tostring(value2[0])), abs(todouble(value1[0]) - todouble(value2[0])))
};
let T = (tbl | extend _entity = column_ifexists(id_col, ''), _partition = column_ifexists(partition_col, '') | project-reorder _entity, _partition);
let sum_data = (
// Calculates summary statistics to be used for normalization.
T
| project-reorder _entity
| project _partition, p = pack_array(*)
| mv-expand with_itemindex=idx p
| summarize count(), avg(todouble(p)), stdev(todouble(p)) by _partition, idx
| sort by _partition, idx asc
| summarize make_list(avg_p), make_list(stdev_p) by _partition
);
let normalized_data = (
// Performs normalization on numerical variables by substrcting mean and scaling by standard deviation. Other normalization techniques can be used
// by adding metrics to previous function and using here.
T
| project _partition, p = pack_array(*)
| join kind = leftouter (sum_data) on _partition
| mv-apply p, list_avg_p, list_stdev_p on (
extend normalized = iff((not(isnan(todouble(list_avg_p))) and (list_stdev_p > 0)), pack_array((todouble(p) - todouble(list_avg_p))/todouble(list_stdev_p)), p)
| summarize a = make_list(normalized) by _partition
)
| project _partition, a
);
let dist_data = (
// Calculates distances of included variables and sums them up to get a multivariate distance between all entities under the same partition.
normalized_data
| join kind = inner (normalized_data) on _partition
| project entity = tostring(a[0]), entity1 = tostring(a1[0]), a = array_slice(a, 1, -1), a1 = array_slice(a1, 1, -1), _partition
| mv-apply a, a1 on
(
project d = generic_dist(pack_array(a), pack_array(a1))
| summarize d = make_list(d)
)
| extend dist = bin((1.0*array_sum(d)-1.0)/array_length(d), 0.0001) // -1 cancels the artifact distance calculated between entity names appearing in the bag and normalizes by number of features
| project-away d
| where entity != entity1
| sort by _partition asc, entity asc, dist asc
);
dist_data
};
//
let raw_data = datatable(name:string, gender: string, height:int, weight:int, limbs:int, accessory:string, type:string)[
'Andy', 'M', 160, 80, 4, 'Hat', 'Person',
'Betsy', 'F', 170, 70, 4, 'Bag', 'Person',
'Cindy', 'F', 130, 30, 4, 'Hat', 'Person',
'Dan', 'M', 190, 105, 4, 'Hat', 'Person',
'Elmie', 'M', 110, 30, 4, 'Toy', 'Person',
'Franny', 'F', 170, 65, 4, 'Bag', 'Person',
'Godzilla', '?', 260, 210, 5, 'Tail', 'Person',
'Hannie', 'F', 112, 28, 4, 'Toy', 'Person',
'Ivie', 'F', 105, 20, 4, 'Toy', 'Person',
'Johnnie', 'M', 107, 21, 4, 'Toy', 'Person',
'Kyle', 'M', 175, 76, 4, 'Hat', 'Person',
'Laura', 'F', 180, 70, 4, 'Bag', 'Person',
'Mary', 'F', 160, 60, 4, 'Bag', 'Person',
'Noah', 'M', 178, 90, 4, 'Hat', 'Person',
'Odelia', 'F', 186, 76, 4, 'Bag', 'Person',
'Paul', 'M', 158, 69, 4, 'Bag', 'Person',
'Qui', 'F', 168, 62, 4, 'Bag', 'Person',
'Ronnie', 'M', 108, 26, 4, 'Toy', 'Person',
'Sonic', 'F', 52, 20, 6, 'Tail', 'Pet',
'Tweety', 'F', 52, 20, 6, 'Tail', 'Pet' ,
'Ulfie', 'M', 39, 29, 4, 'Wings', 'Pet',
'Vinnie', 'F', 53, 22, 1, 'Tail', 'Pet',
'Waldo', 'F', 51, 21, 4, 'Tail', 'Pet',
'Xander', 'M', 50, 24, 4, 'Tail', 'Pet'
];
raw_data
| invoke pairwise_dist_fl('name', 'type')
| where _partition == 'Person' | sort by entity asc, entity1 asc
| evaluate pivot (entity, max(dist), entity1) | sort by entity1 asc
输出
entity1 | Andy | Betsy | Cindy | Dan | Elmie | Franny | Godzilla | Hannie | ... |
---|---|---|---|---|---|---|---|---|---|
Andy | 0.354 | 0.4125 | 0.1887 | 0.4843 | 0.3702 | 1.2087 | 0.6265 | ... | |
Betsy | 0.354 | 0.416 | 0.4708 | 0.6307 | 0.0161 | 1.2051 | 0.4872 | ... | |
Cindy | 0.4125 | 0.416 | 0.6012 | 0.3575 | 0.3998 | 1.4783 | 0.214 | ... | |
Dan | 0.1887 | 0.4708 | 0.6012 | 0.673 | 0.487 | 1.0199 | 0.8152 | ... | |
Elmie | 0.4843 | 0.6307 | 0.3575 | 0.673 | 0.6145 | 1.5502 | 0.1565 | ... | |
Franny | 0.3702 | 0.0161 | 0.3998 | 0.487 | 0.6145 | 1.2213 | 0.471 | ... | |
Godzilla | 1.2087 | 1.2051 | 1.4783 | 1.0199 | 1.5502 | 1.2213 | 1.5495 | ... | |
Hannie | 0.6265 | 0.4872 | 0.214 | 0.8152 | 0.1565 | 0.471 | 1.5495 | ... | |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
面对两种不同类型的实体,我们希望在考虑名义变量(例如性别或偏好配件)和数值变量(例如肢体数量、身高、和重量)的情况下,计算属于同一类型的实体之间的距离。 数值变量位于不同的标度,必须集中并缩放,这会由系统自动完成。 输出是同一分区下的实体对,计算了其多元距离。 可以直接分析输出,将其可视化为距离矩阵或散点图,或者通过计算每个实体的平均距离(值较大的实体表示全局离群值)将其用作离群值检测算法的输入数据。 例如,使用距离矩阵添加可选的可视化效果时,会得到示例中所示的表。 在示例中可以看到:
- 某些实体对(Betsy 和 Franny)的距离值较小(接近 0),表示它们相似。
- 某些实体对(Godzilla 和 Elmie)的距离值较大(1 或更大),表示它们不同。
可以进一步使用输出来计算每个实体的平均距离。 较大的平均距离可能表示存在全局离群值。 例如,我们可以看到,Godzilla 与其他实体之间的平均距离较大,表示它可能是一个全局离群值。