使用 K-means(K 均值)聚类时,需要事先自己指定 cluster(簇)的数目 K。
这个 cluster 数目通常可以先用 Canopy 算法做一次预处理(粗聚类)来确定。
canopy具体描述可以参考这里。
下面是用 Go 语言写的一个实现(根据经纬度之间的球面距离对坐标点进行 Canopy 聚类)。
// Command canopy demonstrates canopy clustering of latitude/longitude points
// using the great-circle (haversine) distance between them.
package main

import (
	"fmt"
	"math"
)

const (
	// EARTH_RADIUS is the Earth radius, in kilometers, used by
	// GreatCircleDistance. The name is kept as-is for compatibility.
	EARTH_RADIUS = 6371
)

// Point is a geographic coordinate. Both fields are expressed in degrees.
type Point struct {
	lat float64
	lng float64
}

// Pop removes the first element of points and returns it together with the
// remaining elements. The returned slice shares its backing array with points.
// For an empty input it returns the zero Point and a nil slice.
func Pop(points []Point) (p Point, newPoints []Point) {
	if len(points) == 0 {
		return Point{}, nil
	}
	return points[0], points[1:]
}

// Push appends p to points and returns the resulting slice.
func Push(p Point, points []Point) []Point {
	return append(points, p)
}

// GreatCircleDistance calculates the haversine distance between two points
// in kilometers.
//
// Original implementation from:
// http://www.movable-type.co.uk/scripts/latlong.html
func GreatCircleDistance(p1, p2 Point) float64 {
	const degToRad = math.Pi / 180.0

	dLat := (p2.lat - p1.lat) * degToRad
	dLon := (p2.lng - p1.lng) * degToRad
	lat1 := p1.lat * degToRad
	lat2 := p2.lat * degToRad

	sinLat := math.Sin(dLat / 2)
	sinLon := math.Sin(dLon / 2)
	a := sinLat*sinLat + sinLon*sinLon*math.Cos(lat1)*math.Cos(lat2)
	// Rounding can push a marginally above 1, which would make Sqrt(1-a)
	// return NaN; clamp so the result stays finite.
	if a > 1 {
		a = 1
	}
	c := 2 * math.Atan2(math.Sqrt(a), math.Sqrt(1-a))
	return EARTH_RADIUS * c
}

// buildCanopies groups points into canopies and returns them in creation
// order. The first point of every canopy is its center.
//
// Algorithm:
//
//	while there are points without a strong mark {
//	    pick such a point p and make it the center of a new canopy c
//	    every point with distance <= x1 from p joins c (weak mark: it may
//	        still join other canopies)
//	    every point with distance <= x2 from p additionally gets a strong
//	        mark: it will not join any other canopy
//	}
//
// Only latitude/longitude points and their great-circle distance are
// supported for now; in general a point could be an arbitrary vector.
// The input slice is not modified.
func buildCanopies(points []Point, x1, x2 float64) [][]Point {
	var canopies [][]Point
	remaining := points
	for len(remaining) > 0 {
		var center Point
		center, remaining = Pop(remaining)

		canopy := []Point{center}
		// candidates collects the points that stay available for later canopies.
		var candidates []Point
		for _, cur := range remaining {
			distance := GreatCircleDistance(center, cur)
			if distance <= x1 {
				canopy = append(canopy, cur)
				if distance > x2 {
					// Weakly marked only: keep it as a candidate.
					candidates = Push(cur, candidates)
				}
			} else {
				candidates = Push(cur, candidates)
			}
		}

		canopies = append(canopies, canopy)
		remaining = candidates
	}
	return canopies
}

// CanopyCluster runs canopy clustering over points and prints the result to
// standard output.
//
// x1 is the loose threshold and x2 the tight threshold, both in kilometers
// (the unit returned by GreatCircleDistance). A point within x1 of a center
// joins that canopy; a point within x2 is additionally removed from further
// consideration.
func CanopyCluster(points []Point, x1, x2 float64) {
	cluster := buildCanopies(points, x1, x2)

	for _, c := range cluster {
		// Report the size of the canopy, not its center point.
		fmt.Printf("current number of items in this canopy %d\n", len(c))
	}

	for k, c := range cluster {
		fmt.Println("canopy", k, "has", len(c), "items:")
		for _, v := range c {
			fmt.Println("\t", v.lat, v.lng)
		}
	}
}

func main() {
	pointsList := []Point{
		{34.28637, -110.12059},
		{34.28638, -110.1206},
		{34.29077, -110.12078},
		{34.29111, -110.11941},
		{34.29113, -110.11938},
		{34.29116, -110.1194},
		{34.29145, -110.12043},
		{34.29146, -110.12063},
		{34.29154, -110.11873},
		{34.3141, -110.11556},
		{34.31411, -110.11557},
		{34.31411, -110.11556},
		{34.31412, -110.11556},
		{34.31412, -110.11557},
		{34.31415, -110.11552},
		{34.31415, -110.11556},
	}
	CanopyCluster(pointsList, 1.0, 0.8)
}
时间: 2024-10-13 14:44:42