% Conference paper: parallel KMeans on the HPCC Systems platform (CSITSS 2019).
% The citation key is kept unchanged so existing citations keep resolving.
% The former "series" field duplicated "booktitle" verbatim and was dropped.
@inproceedings{06295a1f86f049b984e2d5bddb137241,
  author    = {Xu, Lili and Apon, Amy and Villanustre, Flavio and Dev, Roger and Chala, Arjuna},
  title     = {Massively Scalable Parallel {KMeans} on the {HPCC Systems} Platform},
  booktitle = {{CSITSS} 2019 - 2019 4th International Conference on Computational Systems and Information Technology for Sustainable Solution, Proceedings},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  year      = {2019},
  month     = dec,
  doi       = {10.1109/CSITSS47250.2019.9031047},
  language  = {English},
  note      = {Conference held 20--21 December 2019. {\textcopyright} 2019 IEEE},
  keywords  = {HPCC Systems, High Performance Computing, Hybrid Parallelism, Machine Learning, Scalable KMeans},
  abstract  = {Clustering algorithms are an important part of unsupervised machine learning. With Big Data, applying clustering algorithms such as KMeans has become a challenge due to the significantly larger volume of data and the computational complexity of the standard approach, Lloyd's algorithm. This work aims to tackle this challenge by transforming the classic clustering KMeans algorithm to be highly scalable and to be able to operate on Big Data. We leverage the distributed computing environment of the HPCC Systems platform. The presented KMeans algorithm adopts a hybrid parallelism method to achieve a massively scalable parallel KMeans. Our approach can save a significant amount of time of researchers and machine learning practitioners who train hundreds of models on a daily basis. The performance is evaluated with different size datasets and clusters and the results show a significant scalability of the scalable parallel KMeans algorithm.},
}