需求:有 2 个 list(alist、blist),alist 中的每个值都要与 blist 中的每个值做字符串相似度计算,两个 list 的数量级均为 20 万。下面是 python 和 go 的代码片段。 python:
# Compute the similarity of two strings
def similar(a, b):
    """Return the similarity of two strings as a fraction.

    The result is the ``fuzz.token_set_ratio`` score divided by 100.
    Returns 0 if either argument is None.

    Both strings are lower-cased before scoring. Previously only ``b`` was
    lower-cased, which made the comparison case-sensitive on one side only
    whenever the scorer does not normalise case itself.
    """
    if a is None or b is None:
        return 0
    similarity = fuzz.token_set_ratio(a.lower(), b.lower()) / 100
    # Debug trace; prints the original (non-normalised) inputs.
    print("similar a:", a, ",", "b:", b, ", similarity:", similarity)
    return similarity
def compute_similarity(args):
    """Score one ``(record_name, name)`` pair.

    Takes a single tuple so it can be used directly with ``Executor.map``.
    Returns ``(similarity, name)`` so the caller can tell which candidate
    the score belongs to.
    """
    left, candidate = args
    score = similar(left, candidate)
    return score, candidate
# Update database records
def update_database(cursor, name_mapping, csv_data):
    """Set ``factory`` on fingerprint rows whose name matches a CSV record.

    Args:
        cursor: DB-API cursor; only ``executemany`` is called on it, and only
            when there is at least one update to apply.
        name_mapping: mapping of UUID -> iterable of names for that UUID.
        csv_data: iterable of dict-like rows read via ``row.get("vendor")``
            and ``row.get("name")``.

    Returns:
        The number of ``(vendor, uuid)`` update tuples sent to the database.
    """
    update_sql = "UPDATE tweb_fingerprint_test SET factory = %s WHERE uuid = %s"

    # Build a reverse mapping so a UUID can be looked up quickly by name.
    name_to_uuid = defaultdict(list)
    for uuid, names in name_mapping.items():
        for name in names:
            if name:  # skip None or empty names
                name_to_uuid[name].append(uuid)

    # The candidate list is the same for every row, so build it once.
    candidate_names = list(name_to_uuid)

    updates = []
    with ProcessPoolExecutor() as executor:
        for row in csv_data:
            vendor_name = row.get("vendor")
            record_name = row.get("name")
            print("record_name:", record_name)
            if (
                vendor_name is None
                or record_name is None
                or vendor_name in ("未知", "None")
            ):
                continue  # skip this row

            # Exact name lookup first. Copy the list so the fuzzy branch
            # below can never mutate the list stored in the reverse mapping.
            uuids_to_update = list(name_to_uuid.get(record_name, ()))

            # No exact match: fall back to names scoring above 98% similarity.
            if not uuids_to_update:
                tasks = [(record_name, name) for name in candidate_names]
                # A larger chunksize avoids one IPC round-trip per pair.
                results = executor.map(compute_similarity, tasks, chunksize=1024)
                for similarity, name in results:
                    if similarity > 0.98:
                        # Extend with the UUIDs themselves; the old code
                        # extended with whole lists, producing list-valued
                        # SQL parameters.
                        uuids_to_update.extend(name_to_uuid[name])

            # Queue one update per matched UUID.
            updates.extend((vendor_name, matched) for matched in uuids_to_update)

    # Apply all updates in one batch.
    if updates:
        cursor.executemany(update_sql, updates)

    # Return the number of queued updates.
    return len(updates)
go:
// Similar returns the Jaro-Winkler score of a and b as computed by
// smetrics.JaroWinkler with a boost threshold of 0.7 and a prefix size of 4.
func Similar(a, b string) float64 {
	const (
		boostThreshold = 0.7
		prefixSize     = 4
	)
	score := smetrics.JaroWinkler(a, b, boostThreshold, prefixSize)
	return score
}
// UpdateDatabase updates the database with the new vendor information.
//
// For every record it starts a goroutine that compares the record name with
// every vendor name. A vendor matches when the names are equal or when
// Similar exceeds SimilarityThreshold. Each match is executed as an UPDATE
// through a single prepared statement on the calling goroutine.
//
// It returns the number of UPDATE statements executed successfully and the
// first error encountered, if any. On error the producer goroutines are told
// to stop via the done channel, so they do not stay blocked on the updates
// channel after this function has returned.
func UpdateDatabase(db *sql.DB, vendors map[string]Vendor, records []CSVRecord) (int, error) {
	fmt.Println("records", len(records))
	fmt.Println("vendors", len(vendors))

	stmt, err := db.Prepare("UPDATE tweb_fingerprint SET factory = ? WHERE uuid = ?")
	if err != nil {
		return 0, err
	}
	defer stmt.Close()

	// done is closed when this function returns. Producers select on it so
	// an early return (Exec error) cannot leave them blocked on a send.
	done := make(chan struct{})
	defer close(done)

	var wg sync.WaitGroup
	updates := make(chan Updatedata, len(records))
	for _, record := range records {
		wg.Add(1)
		go func(record CSVRecord) {
			defer wg.Done()
			for _, vendor := range vendors {
				if record.Name == vendor.Name.String || Similar(record.Name, vendor.Name.String) > SimilarityThreshold {
					select {
					case updates <- Updatedata{
						UUID:    vendor.UUID,
						Factory: record.Vendor,
					}:
					case <-done:
						// Consumer is gone; stop producing.
						return
					}
				}
			}
		}(record)
	}

	// Close the channel once every producer has finished so the range loop
	// below terminates.
	go func() {
		wg.Wait()
		close(updates)
	}()

	count := 0
	for update := range updates {
		fmt.Println("update:", update)
		if _, err := stmt.Exec(update.Factory, update.UUID); err != nil {
			return count, err
		}
		count++
	}
	return count, nil
}
1
Baloneo 2023-12-13 16:27:22 +08:00
快多少?
|