Spark机器学习
K-均值(K-means)聚类 目的:最小化所有类簇中的方差之和
类簇内方差和(WCSS,within cluster sum of squared errors)、fuzzy K-means、层次聚类(hierarchical clustering)
凝聚聚类(agglomerative clustering)、分裂式聚类(divisive clustering)
1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy Story (1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0
unknown|0
Action|1
Adventure|2
Animation|3
Children's|4
// Build a lookup from genre index (kept as a string) to genre name.
// Each non-empty record is split on '|' and the two fields are flipped,
// so field 1 becomes the key and field 0 becomes the value.
val genreMap = genres
  .filter(_.nonEmpty)
  .map(_.split("\\|"))
  .map(fields => (fields(1), fields(0)))
  .collectAsMap()
println(genreMap)
// Output:
// Map(2 -> Adventure, 5 -> Comedy, 12 -> Musical, 15 -> Sci-Fi, 8 -> Drama, 18 -> Western, 7 -> Documentary, 17 -> War, 1 -> Action, 4 -> Children's, 11 -> Horror, 14 -> Romance, 6 -> Crime, 0 -> unknown, 9 -> Fantasy, 16 -> Thriller, 3 -> Animation, 10 -> Film-Noir, 13 -> Mystery)
// Pair every movie id with its title and the names of the genres it is flagged with.
val titlesAndGenres = movies.map(_.split("\\|")).map { fields =>
  // Fields from index 5 onward are treated as per-genre flags; "1" marks membership.
  val genreFlags = fields.toSeq.drop(5)
  // The position of a set flag is looked up in genreMap (keyed by the index as a string).
  val genresAssigned = genreFlags.zipWithIndex.collect {
    case (flag, idx) if flag == "1" => genreMap(idx.toString)
  }
  (fields(0).toInt, (fields(1), genresAssigned))
}
println(titlesAndGenres.first)
// Output:
// (1,(Toy Story (1995),ArrayBuffer(Animation, Children's, Comedy)))
Movie factors mean: [-0.2955253575969453,-0.2894017158566661,-0.2319822953560126,0.002917648331681182,0.16553261386128745,-0.21992534888550966,-0.03380127825698873,-0.20603088790834398,-0.15619861138444532,-0.028497688228493936,0.16963530616257805,0.14388067884376599,-0.0017092576491059591,-0.09626837303920982,-0.06064127207772349,-0.06045518556672421,0.18751923345914,0.2399624456229244,0.26532560070303446,0.05541910564428427,-0.015674971004015527,0.011168436718639107,-0.04741294377492476,0.11574693735017375,0.1289987655696671,0.44134038441588025,-0.5900688729554584,-0.03768358034266212,0.008887881298347921,0.20425041421871237,-0.20022602485759528,0.2697605004694663,0.10361325058554109,0.210277185021123,-0.22259636797095098,0.1174637780755839,-0.13688720440722232,0.03767713022869551,-0.0558405163043045,-0.12431407617904076,-0.046046222769634326,-0.20808223343120555,-0.3272035383525689,-0.2069514616509938,-0.0754149005227642,0.0856900404902959,0.06164062157888312,-0.06518672356795488,-0.32742867325628294,0.20285276122166002]
Movie factors variance: [0.04665858952268178,0.0348699710379137,0.03579479789217705,0.029901233287017822,0.03584001448747631,0.030266373892482327,0.03305718879524182,0.02686945635500392,0.025320493042287756,0.024438753389466123,0.02575390293411112,0.028972744965903668,0.02827972104601086,0.033613911928577246,0.033662480558315735,0.02833746789842838,0.02994391891556463,0.04394221701123749,0.03435422169469965,0.03561654218653061,0.02682492748548697,0.029604704664741063,0.024673702738648908,0.030823597518982247,0.028442111141151614,0.03613743157595084,0.05449475590178096,0.042012621165520236,0.028896894307921802,0.033241681676696215,0.02633619851965672,0.035711481477364235,0.025481764774248593,0.028764828375131987,0.0272530758482775,0.029673397451581197,0.03148963131813852,0.03622387708462999,0.02816170323774573,0.033017372289155716,0.028641152942670445,0.02904189221086495,0.030234076747492195,0.04509970117296679,0.029449883713724593,0.02756067270740738,0.04139144468263727,0.030245838006703486,0.03131689738936245,0.03378427027186054]
User factors mean: [-0.49867551877683824,-0.44459915531291566,-0.36051481893169574,0.017151848798776233,0.3213603826583396,-0.3196901675619378,-0.07224328943358119,-0.29041744434669287,-0.22727507102332345,-0.03178720415880569,0.25862894293461186,0.22888402894019788,0.012327199821030293,-0.14990885838046697,-0.1281515413333295,-0.09431829455241085,0.2679196025618735,0.38335691552119355,0.34604572069945905,0.11174974992685119,-0.02180706957147866,0.005610012764397019,-0.08491397316018835,0.20231176194774866,0.17161396689284497,0.6398163397864598,-0.8673987425745228,-0.10283010351171737,0.028330477167842844,0.30443187793692406,-0.301912604145753,0.4138735923728453,0.2256847560401456,0.285848070636566,-0.2605794171061,0.22449121780469036,-0.23998269543836812,0.036814175516996,-0.0679476059994798,-0.14427917258340417,-0.0833994179810923,-0.3582564875155623,-0.4564982359022274,-0.358039104184582,-0.12317214145750788,0.15037235678650748,0.06053431528961892,-0.06831269426575506,-0.5051800522709825,0.3151860279443428]
User factors variance: [0.04443021235777847,0.03057049227554263,0.03697914530468325,0.037294292401115044,0.035107560835514376,0.03456306589890253,0.03582898189508532,0.028049150877918694,0.032265557628909904,0.033678972590911474,0.03107771568048393,0.03456737756860466,0.035184013404102404,0.04264936219513472,0.04120372326054623,0.03364277736735525,0.040292531435941865,0.04006060147670186,0.03950365342886879,0.04560154697337463,0.030231562691714647,0.041120732342916626,0.03118953330313852,0.03508187607535198,0.03228272297984499,0.03603017959168009,0.04917534366846078,0.059425007832722164,0.03161224197770566,0.04211986001194535,0.02891350391303218,0.05259534335774597,0.03483271651803892,0.040489027307905476,0.03125884956067426,0.0379774604293261,0.035875980098136084,0.043509576391072786,0.03338290356822281,0.03675372599031079,0.03379511912889908,0.02951817116168268,0.0430380317818896,0.04214608566562065,0.03376833767379957,0.0314188022932176,0.048481326691437995,0.03724671278315033,0.034103714500646594,0.046064657833824844]
通过交叉验证选择K
// Choose K for the movie clusters: hold out 40% of the movie vectors, train a
// K-means model on the remaining 60% for each candidate K, and report the
// within-cluster sum of squared errors (WCSS) on the held-out part.
val trainTestSplitMovies = movieVectors.randomSplit(Array(0.6, 0.4), 123)
val trainMovies = trainTestSplitMovies(0)
val testMovies = trainTestSplitMovies(1)
val costsMovies = Seq(2, 3, 4, 5, 10, 20).map { k =>
  // KMeans.train expects (data, k, maxIterations, runs). The earlier version
  // passed (numIterations, k), which fixed the number of clusters at
  // numIterations and only varied the iteration cap.
  (k, KMeans.train(trainMovies, k, numIterations, numRuns).computeCost(testMovies))
}
println("Movie clustering cross-validation:")
costsMovies.foreach { case (k, cost) => println(f"WCSS for K=$k id $cost%2.2f") }
// Output recorded with the earlier (swapped-argument) call; re-run to refresh:
// WCSS for K=2 id 870.36
// WCSS for K=3 id 858.28
// WCSS for K=4 id 847.40
// WCSS for K=5 id 840.71
// WCSS for K=10 id 842.58
// WCSS for K=20 id 843.24
// Choose K for the user clusters: hold out 40% of the user vectors, train a
// K-means model on the remaining 60% for each candidate K, and report the
// within-cluster sum of squared errors (WCSS) on the held-out part.
val trainTestSplitUsers = userVectors.randomSplit(Array(0.6, 0.4), 123)
val trainUsers = trainTestSplitUsers(0)
val testUsers = trainTestSplitUsers(1)
val costsUsers = Seq(2, 3, 4, 5, 10, 20).map { k =>
  // KMeans.train expects (data, k, maxIterations, runs). The earlier version
  // passed (numIterations, k), which fixed the number of clusters at
  // numIterations and only varied the iteration cap.
  (k, KMeans.train(trainUsers, k, numIterations, numRuns).computeCost(testUsers))
}
println("User clustering cross-validation:")
costsUsers.foreach { case (k, cost) => println(f"WCSS for K=$k id $cost%2.2f") }
// Output recorded with the earlier (swapped-argument) call; re-run to refresh:
// WCSS for K=2 id 573.50
// WCSS for K=3 id 580.33
// WCSS for K=4 id 574.84
// WCSS for K=5 id 575.61
// WCSS for K=10 id 586.05
// WCSS for K=20 id 577.01
