paper_2018.bib

% Workshop paper (ICDMW 2018) describing pKluster, a parallel k-means clustering tool for geospatial data.
% Title braces protect whole words and the math so sentence-casing styles leave them intact.
@inproceedings{Mills_ICDM_2018,
  author = {Richard Tran Mills and Vamsi Sripathi and Jitendra Kumar and Sarat Sreepathi and Forrest M. Hoffman and William W. Hargrove},
  title = {Parallel {$k$}-means Clustering of Geospatial Data Sets Using {Manycore} {CPU} Architectures},
  booktitle = {Proceedings of the 2018 {IEEE} International Conference on Data Mining Workshops ({ICDMW} 2018)},
  organization = {Institute of Electrical and Electronics Engineers (IEEE)},
  publisher = {Conference Publishing Services (CPS)},
  doi = {10.1109/ICDMW.2018.00118},
  note = {\url{https://doi.org/10.1109/ICDMW.2018.00118}},
  day = 17,
  month = nov,
  year = 2018,
  abstract = {The increasing availability of high-resolution geospatiotemporal data sets from sources such as observatory networks, remote sensing platforms, and computational Earth system models has opened new possibilities for knowledge discovery and mining of weather, climate, ecological, and other geoscientific data sets fused from disparate sources. Many of the standard tools used on individual workstations are impractical for the analysis and synthesis of data sets of this size; however, new algorithmic approaches that can effectively utilize the complex memory hierarchies and the extremely high levels of parallelism available in state-of-the-art high-performance computing platforms can enable such analysis. Here, we describe \textit{pKluster}, an open-source tool we have developed for accelerated $k$-means clustering of geospatial and geospatiotemporal data, and discuss algorithmic modifications and code optimizations we have made to enable it to effectively use parallel machines based on novel CPU architectures---such as the Intel Knights Landing Xeon Phi and Skylake Xeon processors---with many cores and hardware threads, and employing significant single instruction, multiple data (SIMD) parallelism. We outline some applications of the code in ecology and climate science contexts and present a detailed discussion of the performance of the code for one such application, LiDAR-derived vertical vegetation structure classification.}
}

% Workshop paper (ICDMW 2018) on outlier detection in streaming time series data from ARM distributed sensors.
% The acronym ARM is brace-protected so sentence-casing styles do not lowercase it.
@inproceedings{Lu_ICDM_2018,
  author = {Yuping Lu and Jitendra Kumar and Nathan Collier and Bhargavi Krishna and Michael Langston},
  title = {Detecting Outliers in Streaming Time Series Data from {ARM} Distributed Sensors},
  booktitle = {Proceedings of the 2018 {IEEE} International Conference on Data Mining Workshops ({ICDMW} 2018)},
  organization = {Institute of Electrical and Electronics Engineers (IEEE)},
  publisher = {Conference Publishing Services (CPS)},
  doi = {10.1109/ICDMW.2018.00117},
  note = {\url{https://doi.org/10.1109/ICDMW.2018.00117}},
  day = 17,
  month = nov,
  year = 2018
}

% Workshop paper (ICDMW 2018) on mapping wildfires in Interior Alaska with deep neural networks.
% The proper nouns in the title are brace-protected as whole words so sentence-casing styles keep their capitals.
@inproceedings{Langford_ICDM_2018,
  author = {Zachary L. Langford and Jitendra Kumar and Forrest M. Hoffman},
  title = {Wildfire Mapping in {Interior} {Alaska} Using Deep Neural Networks on Imbalanced Datasets},
  booktitle = {Proceedings of the 2018 {IEEE} International Conference on Data Mining Workshops ({ICDMW} 2018)},
  organization = {Institute of Electrical and Electronics Engineers (IEEE)},
  publisher = {Conference Publishing Services (CPS)},
  doi = {10.1109/ICDMW.2018.00116},
  note = {\url{https://doi.org/10.1109/ICDMW.2018.00116}},
  day = 17,
  month = nov,
  year = 2018,
  abstract = {Wildfires are the dominant disturbance impacting many regions in Alaska and are expected to intensify due to climate change. Accurate tracking and quantification of wildfires are important for climate modeling and ecological studies in this region. Remote sensing platforms (e.g., MODIS, Landsat) are valuable tools for mapping wildfire events (burned or burning areas) in Alaska. Deep neural networks (DNN) have exhibited superior performance in many classification problems, such as high-dimensional remote sensing data. Detection of wildfires is an imbalanced classification problem where one class contains a much smaller or larger sample size, and performance of DNNs can decline. We take a known weight-selection strategy during DNN training and apply those weights to MODIS variables (e.g., NDVI, surface reflectance) for binary classification (i.e., wildfire or no-wildfire) across Alaska during the 2004 wildfire year, when Alaska experienced a record number of large wildfires. The method splits the input training data into subsets, one for training the DNN to update weights and the other for performance validation to select the weights based on the best validation-loss score. This approach was applied to two sampled datasets, such as where the no-wildfire class can significantly outweigh the wildfire class. The normal DNN training strategy was unable to map wildfires for the highly imbalanced dataset; however, the weight-selection strategy was able to map wildfires very accurately (0.96 recall score for 78,702 wildfire pixels ($500 \times 500$~m)).}
}

% Conference paper (IEEE Big Data 2018) on using machine learning and social media to mine and disseminate scientific data.
% Given names are stored as initials here; expand them once the full names are confirmed.
@inproceedings{Devarakonda_IEEEBigData_2018,
  author = {R. Devarakonda and M. Giansiracusa and J. Kumar},
  title = {Machine Learning and Social Media to Mine and Disseminate Big Scientific Data},
  booktitle = {2018 {IEEE} International Conference on Big Data (Big Data)},
  pages = {5312--5315},
  keywords = {data mining;information dissemination;learning (artificial intelligence);natural language processing;query processing;relational databases;social networking (online);SQL;database queries;social media;natural language processing;Atmospheric Radiation Measurement Data Center;Oak Ridge National Laboratory;ORNL;easy search;environmental data centers;machine learning framework;data streams;data-intensive projects;data discovery;big scientific data;scientific databases;database languages;Structured Query Language;SQL language;SQL queries;relational databases;structured databases;data retrieval;data mining;data dissemination;Natural language processing;Twitter;Databases;Laboratories;Structured Query Language;machine learning;natural language processing;social media interaction;scientific data mining;stream pipelining},
  doi = {10.1109/BigData.2018.8622470},
  note = {\url{https://doi.org/10.1109/BigData.2018.8622470}},
  month = dec,
  year = 2018
}