Publications
2024 |
R. Pérez-Chacón and G. Asencio-Cortés and A. Troncoso and F. Martínez-Álvarez Pattern sequence-based algorithm for multivariate big data time series forecasting: Application to electricity consumption (Journal Article) Future Generation Computer Systems, 154 , pp. 397-412, 2024. (Abstract | Links | BibTeX | Tags: big data, energy, forecasting, time series) @article{PEREZ24, title = {Pattern sequence-based algorithm for multivariate big data time series forecasting: Application to electricity consumption}, author = {R. Pérez-Chacón and G. Asencio-Cortés and A. Troncoso and F. Martínez-Álvarez}, url = {https://www.sciencedirect.com/science/article/pii/S0167739X23004752}, doi = {https://doi.org/10.1016/j.future.2023.12.021}, year = {2024}, date = {2024-01-29}, journal = {Future Generation Computer Systems}, volume = {154}, pages = {397-412}, abstract = {Several interrelated variables typically characterize real-world processes, and a time series cannot be predicted without considering the influence that other time series might have on the target time series. This work proposes a novel algorithm to forecast multivariate big data time series. This new general-purpose approach consists first of a previous pattern recognition performed jointly using all time series that form the multivariate time series and then predicts the target time series by searching for similarities between pattern sequences. The proposed algorithm is designed to tackle multivariate time series forecasting problems within the context of big data. In particular, the algorithm has been developed with a distributed nature to enhance its efficiency in analyzing and processing large volumes of data. Moreover, the algorithm is straightforward to use, with only two parameters needing adjustment. Another advantage of the MV-bigPSF algorithm is its ability to perform multi-step forecasting, which is particularly useful in many practical applications. 
To evaluate the algorithm’s performance, real-world data from Uruguay’s power consumption has been utilized. Specifically, MV-bigPSF has been compared with both univariate and multivariate methods. Regarding the univariate ones, MV-bigPSF improved 12.8% in MAPE compared to the second-best method. Regarding the multivariate comparison, MV-bigPSF improved 44.8% in MAPE with respect to the second most accurate method. Regarding efficiency, the execution time of MV-bigPSF was 1.83 times faster than the second-fastest multivariate method, both in a single-core environment. Therefore, the proposed algorithm can be a valuable tool for practitioners and researchers working in multivariate time series forecasting, particularly in big data applications.}, keywords = {big data, energy, forecasting, time series}, pubstate = {published}, tppubtype = {article} } Several interrelated variables typically characterize real-world processes, and a time series cannot be predicted without considering the influence that other time series might have on the target time series. This work proposes a novel algorithm to forecast multivariate big data time series. This new general-purpose approach consists first of a previous pattern recognition performed jointly using all time series that form the multivariate time series and then predicts the target time series by searching for similarities between pattern sequences. The proposed algorithm is designed to tackle multivariate time series forecasting problems within the context of big data. In particular, the algorithm has been developed with a distributed nature to enhance its efficiency in analyzing and processing large volumes of data. Moreover, the algorithm is straightforward to use, with only two parameters needing adjustment. Another advantage of the MV-bigPSF algorithm is its ability to perform multi-step forecasting, which is particularly useful in many practical applications. 
To evaluate the algorithm’s performance, real-world data from Uruguay’s power consumption has been utilized. Specifically, MV-bigPSF has been compared with both univariate and multivariate methods. Regarding the univariate ones, MV-bigPSF improved 12.8% in MAPE compared to the second-best method. Regarding the multivariate comparison, MV-bigPSF improved 44.8% in MAPE with respect to the second most accurate method. Regarding efficiency, the execution time of MV-bigPSF was 1.83 times faster than the second-fastest multivariate method, both in a single-core environment. Therefore, the proposed algorithm can be a valuable tool for practitioners and researchers working in multivariate time series forecasting, particularly in big data applications. |
F. Martínez-Álvarez and R. Scitovski and C. Rubio-Escudero and A. Morales-Esteban Emerging trends in big data analytics and natural disasters (Editorial) (Journal Article) Computers and Geosciences, 182 , pp. 105465, 2024. (Links | BibTeX | Tags: big data, natural disasters, time series) @article{MARTINEZ24, title = {Emerging trends in big data analytics and natural disasters (Editorial)}, author = {F. Martínez-Álvarez and R. Scitovski and C. Rubio-Escudero and A. Morales-Esteban}, url = {https://www.sciencedirect.com/science/article/pii/S0098300423001693}, doi = {https://doi.org/10.1016/j.cageo.2023.105465}, year = {2024}, date = {2024-01-01}, journal = {Computers and Geosciences}, volume = {182}, pages = {105465}, keywords = {big data, natural disasters, time series}, pubstate = {published}, tppubtype = {article} } |
2023 |
P. García-Bringas and H. Pérez-García and F. J. Martínez de Pisón and F. Martínez-Álvarez and A. Troncoso and Á. Herrero and J. L. Calvo-Rolle and H. Quintián and E. Corchado Proceedings of the 18th International Conference on Soft Computing Models in Industrial and Environmental Applications (SOCO 2023) Salamanca, Spain, September 5-7, 2023, volume 1 (Proceedings) Springer, 749 , 2023, ISBN: 978-3-031-42529-5. (Links | BibTeX | Tags: big data, clustering, deep learning, IoT) @proceedings{SOCO2023a, title = {Proceedings of the 18th International Conference on Soft Computing Models in Industrial and Environmental Applications (SOCO 2023) Salamanca, Spain, September 5-7, 2023, volume 1}, author = {P. García-Bringas and H. Pérez-García and F. J. Martínez de Pisón and F. Martínez-Álvarez and A. Troncoso and Á. Herrero and J. L. Calvo-Rolle and H. Quintián and E. Corchado}, editor = {P. García-Bringas and H. Pérez-García and F. J. Martínez de Pisón and F. Martínez-Álvarez and A. Troncoso and Á. Herrero and J. L. Calvo-Rolle and H. Quintián and E. Corchado}, url = {https://link.springer.com/book/10.1007/978-3-031-42529-5}, doi = {https://doi.org/10.1007/978-3-031-42529-5}, isbn = {978-3-031-42529-5}, year = {2023}, date = {2023-09-05}, volume = {749}, publisher = {Springer}, series = {Lecture Notes in Networks and Systems}, keywords = {big data, clustering, deep learning, IoT}, pubstate = {published}, tppubtype = {proceedings} } |
P. García-Bringas and H. Pérez-García and F. J. Martínez de Pisón and F. Martínez-Álvarez and A. Troncoso and Á. Herrero and J. L. Calvo-Rolle and H. Quintián and E. Corchado Proceedings of the 18th International Conference on Soft Computing Models in Industrial and Environmental Applications (SOCO 2023) Salamanca, Spain, September 5-7, 2023, volume 2 (Proceedings) Springer, 750 , 2023, ISBN: 978-3-031-42536-3. (Links | BibTeX | Tags: big data, clustering, deep learning, IoT) @proceedings{SOCO2023b, title = {Proceedings of the 18th International Conference on Soft Computing Models in Industrial and Environmental Applications (SOCO 2023) Salamanca, Spain, September 5-7, 2023, volume 2}, author = {P. García-Bringas and H. Pérez-García and F. J. Martínez de Pisón and F. Martínez-Álvarez and A. Troncoso and Á. Herrero and J. L. Calvo-Rolle and H. Quintián and E. Corchado}, editor = {P. García-Bringas and H. Pérez-García and F. J. Martínez de Pisón and F. Martínez-Álvarez and A. Troncoso and Á. Herrero and J. L. Calvo-Rolle and H. Quintián and E. Corchado}, url = {https://link.springer.com/book/10.1007/978-3-031-42536-3}, doi = {10.1007/978-3-031-42536-3}, isbn = {978-3-031-42536-3}, year = {2023}, date = {2023-09-05}, volume = {750}, publisher = {Springer}, series = {Lecture Notes in Networks and Systems}, keywords = {big data, clustering, deep learning, IoT}, pubstate = {published}, tppubtype = {proceedings} } |
P. García-Bringas and H. Pérez-García and F. J. Martínez de Pisón and F. Martínez-Álvarez and A. Troncoso and Á. Herrero and J. L. Calvo-Rolle and H. Quintián and E. Corchado Proceedings of the 18th International Conference on Hybrid Artificial Intelligent Systems (HAIS 2023) Salamanca, Spain, September 5-7, 2023 (Proceedings) Springer, 14001 , 2023, ISBN: 978-3-031-40725-3. (Links | BibTeX | Tags: big data, clustering, deep learning, IoT) @proceedings{HAIS2023, title = {Proceedings of the 18th International Conference on Hybrid Artificial Intelligent Systems (HAIS 2023) Salamanca, Spain, September 5-7, 2023}, author = {P. García-Bringas and H. Pérez-García and F. J. Martínez de Pisón and F. Martínez-Álvarez and A. Troncoso and Á. Herrero and J. L. Calvo-Rolle and H. Quintián and E. Corchado}, editor = {P. García-Bringas and H. Pérez-García and F. J. Martínez de Pisón and F. Martínez-Álvarez and A. Troncoso and Á. Herrero and J. L. Calvo-Rolle and H. Quintián and E. Corchado}, url = {https://link.springer.com/book/10.1007/978-3-031-40725-3}, doi = {https://doi.org/10.1007/978-3-031-40725-3}, isbn = {978-3-031-40725-3}, year = {2023}, date = {2023-09-05}, volume = {14001}, publisher = {Springer}, series = {Lecture Notes in Artificial Intelligence}, keywords = {big data, clustering, deep learning, IoT}, pubstate = {published}, tppubtype = {proceedings} } |
P. García-Bringas and H. Pérez-García and F. J. Martínez de Pisón and F. Martínez-Álvarez and A. Troncoso and Á. Herrero and J. L. Calvo-Rolle and H. Quintián and E. Corchado Proceedings of the International Joint Conference 16th International Conference on Computational Intelligence in Security for Information Systems (CISIS 2023) 14th International Conference on EUropean Transnational Education (ICEUTE 2023). Salamanca, Spain, September 5-7, 2023 (Proceedings) Springer, 748 , 2023, ISBN: 978-3-031-42519-6. (Links | BibTeX | Tags: big data, clustering) @proceedings{CISIS-ICEUTE2023, title = {Proceedings of the International Joint Conference 16th International Conference on Computational Intelligence in Security for Information Systems (CISIS 2023) 14th International Conference on EUropean Transnational Education (ICEUTE 2023). Salamanca, Spain, September 5-7, 2023}, author = {P. García-Bringas and H. Pérez-García and F. J. Martínez de Pisón and F. Martínez-Álvarez and A. Troncoso and Á. Herrero and J. L. Calvo-Rolle and H. Quintián and E. Corchado}, editor = {P. García-Bringas and H. Pérez-García and F. J. Martínez de Pisón and F. Martínez-Álvarez and A. Troncoso and Á. Herrero and J. L. Calvo-Rolle and H. Quintián and E. Corchado}, url = {https://link.springer.com/book/10.1007/978-3-031-42519-6}, doi = {https://doi.org/10.1007/978-3-031-42519-6}, isbn = {978-3-031-42519-6}, year = {2023}, date = {2023-09-05}, volume = {748}, publisher = {Springer}, series = {Lecture Notes in Networks and Systems}, keywords = {big data, clustering}, pubstate = {published}, tppubtype = {proceedings} } |
A. M. Fernández and D. Gutiérrez-Avilés and A. Troncoso and F. Martínez-Álvarez A new Apache Spark-based framework for big data streaming forecasting in IoT networks (Journal Article) Journal of Supercomputing, 79 , pp. 11078–11100, 2023. (Abstract | Links | BibTeX | Tags: big data, IoT) @article{FERNANDEZ23, title = {A new Apache Spark-based framework for big data streaming forecasting in IoT networks}, author = {A. M. Fernández and D. Gutiérrez-Avilés and A. Troncoso and F. Martínez-Álvarez}, url = {https://link.springer.com/article/10.1007/s11227-023-05100-x}, doi = {https://doi.org/10.1007/s11227-023-05100-x}, year = {2023}, date = {2023-02-02}, journal = {Journal of Supercomputing}, volume = {79}, pages = {11078–11100}, abstract = {Analyzing time-dependent data acquired in a continuous flow is a major challenge for various fields, such as big data and machine learning. Being able to analyze a large volume of data from various sources, such as sensors, networks, and the internet, is essential for improving the efficiency of our society's production processes. Additionally, this vast amount of data is collected dynamically in a continuous stream. The goal of this research is to provide a comprehensive framework for forecasting big data streams from Internet of Things networks and serve as a guide for designing and deploying other third-party solutions. Hence, a new framework for time series forecasting in a big data streaming scenario, using data collected from Internet of Things networks, is presented. This framework comprises of five main modules: Internet of Things network design and deployment, big data streaming architecture, stream data modeling method, big data forecasting method, and a comprehensive real-world application scenario, consisting of a physical Internet of Things network feeding the big data streaming architecture, being the linear regression the algorithm used for illustrative purposes. 
Comparison with other frameworks reveals that this is the first framework that incorporates and integrates all the aforementioned modules.}, keywords = {big data, IoT}, pubstate = {published}, tppubtype = {article} } Analyzing time-dependent data acquired in a continuous flow is a major challenge for various fields, such as big data and machine learning. Being able to analyze a large volume of data from various sources, such as sensors, networks, and the internet, is essential for improving the efficiency of our society's production processes. Additionally, this vast amount of data is collected dynamically in a continuous stream. The goal of this research is to provide a comprehensive framework for forecasting big data streams from Internet of Things networks and serve as a guide for designing and deploying other third-party solutions. Hence, a new framework for time series forecasting in a big data streaming scenario, using data collected from Internet of Things networks, is presented. This framework comprises of five main modules: Internet of Things network design and deployment, big data streaming architecture, stream data modeling method, big data forecasting method, and a comprehensive real-world application scenario, consisting of a physical Internet of Things network feeding the big data streaming architecture, being the linear regression the algorithm used for illustrative purposes. Comparison with other frameworks reveals that this is the first framework that incorporates and integrates all the aforementioned modules. |
M. García-Torres and R. Ruiz and F. Divina Evolutionary feature selection on high dimensional data using a search space reduction approach (Journal Article) Engineering Applications of Artificial Intelligence, 117 , pp. 105556, 2023. (Abstract | Links | BibTeX | Tags: big data, feature selection) @article{garcia2023evolutionary, title = {Evolutionary feature selection on high dimensional data using a search space reduction approach}, author = {M. García-Torres and R. Ruiz and F. Divina}, url = {https://www.sciencedirect.com/science/article/pii/S0952197622005462}, doi = {10.1016/j.engappai.2022.105556}, year = {2023}, date = {2023-01-01}, journal = {Engineering Applications of Artificial Intelligence}, volume = {117}, pages = {105556}, publisher = {Elsevier}, abstract = {Feature selection is becoming more and more a challenging task due to the increase of the dimensionality of the data. The complexity of the interactions among features and the size of the search space make it unfeasible to find the optimal subset of features. In order to reduce the search space, feature grouping has arisen as an approach that allows to cluster feature according to the shared information about the class. On the other hand, metaheuristic algorithms have proven to achieve sub-optimal solutions within a reasonable time. In this work we propose a Scatter Search (SS) strategy that uses feature grouping to generate an initial population comprised of diverse and high quality solutions. Solutions are then evolved by applying random mechanisms in combination with the feature group structure, with the objective of maintaining during the search a population of good and, at the same time, as diverse as possible solutions. Not only does the proposed strategy provide the best subset of features found but it also reduces the redundancy structure of the data. We test the strategy on high dimensional data from biomedical and text-mining domains. 
The results are compared with those obtained by other adaptations of SS and other popular strategies. Results show that the proposed strategy can find, on average, the smallest subsets of features without degrading the performance of the classifier.}, keywords = {big data, feature selection}, pubstate = {published}, tppubtype = {article} } Feature selection is becoming more and more a challenging task due to the increase of the dimensionality of the data. The complexity of the interactions among features and the size of the search space make it unfeasible to find the optimal subset of features. In order to reduce the search space, feature grouping has arisen as an approach that allows to cluster feature according to the shared information about the class. On the other hand, metaheuristic algorithms have proven to achieve sub-optimal solutions within a reasonable time. In this work we propose a Scatter Search (SS) strategy that uses feature grouping to generate an initial population comprised of diverse and high quality solutions. Solutions are then evolved by applying random mechanisms in combination with the feature group structure, with the objective of maintaining during the search a population of good and, at the same time, as diverse as possible solutions. Not only does the proposed strategy provide the best subset of features found but it also reduces the redundancy structure of the data. We test the strategy on high dimensional data from biomedical and text-mining domains. The results are compared with those obtained by other adaptations of SS and other popular strategies. Results show that the proposed strategy can find, on average, the smallest subsets of features without degrading the performance of the classifier. |
2022 |
P. García-Bringas and H. Pérez-García and F. J. Martínez de Pisón and J. R. Villar-Flecha and A. Troncoso and E. A. de la Cal and Á. Herrero and F. Martínez-Álvarez and G. Psaila and H. Quintián and E. Corchado Proceedings of the 17th International Conference on Hybrid Artificial Intelligent Systems (HAIS 2022) Salamanca, Spain, September 5-7, 2022 (Proceedings) Springer, 13469 , 2022, ISBN: 978-3-031-15470-6. (Links | BibTeX | Tags: big data, clustering, deep learning, IoT) @proceedings{HAIS2022, title = {Proceedings of the 17th International Conference on Hybrid Artificial Intelligent Systems (HAIS 2022) Salamanca, Spain, September 5-7, 2022}, author = {P. García-Bringas and H. Pérez-García and F. J. Martínez de Pisón and J. R. Villar-Flecha and A. Troncoso and E. A. de la Cal and Á. Herrero and F. Martínez-Álvarez and G. Psaila and H. Quintián and E. Corchado}, url = {https://link.springer.com/book/10.1007/978-3-031-15471-3}, doi = {https://doi.org/10.1007/978-3-031-15471-3}, isbn = {978-3-031-15470-6}, year = {2022}, date = {2022-09-05}, volume = {13469}, publisher = {Springer}, series = {Lecture Notes in Artificial Intelligence}, keywords = {big data, clustering, deep learning, IoT}, pubstate = {published}, tppubtype = {proceedings} } |
P. García-Bringas and H. Pérez-García and F. J. Martínez de Pisón and J. R. Villar-Flecha and A. Troncoso and E. A. de la Cal and Á. Herrero and F. Martínez-Álvarez and G. Psaila and H. Quintián and E. Corchado Proceedings of the 17th International Conference on Soft Computing Models in Industrial and Environmental Applications (SOCO 2022) Salamanca, Spain, September 5-7, 2022 (Proceedings) Springer, 531 , 2022, ISBN: 978-3-031-18050-7. (Links | BibTeX | Tags: big data, clustering, deep learning, IoT) @proceedings{SOCO2022, title = {Proceedings of the 17th International Conference on Soft Computing Models in Industrial and Environmental Applications (SOCO 2022) Salamanca, Spain, September 5-7, 2022}, author = {P. García-Bringas and H. Pérez-García and F. J. Martínez de Pisón and J. R. Villar-Flecha and A. Troncoso and E. A. de la Cal and Á. Herrero and F. Martínez-Álvarez and G. Psaila and H. Quintián and E. Corchado}, url = {https://link.springer.com/book/10.1007/978-3-031-18050-7}, doi = {https://doi.org/10.1007/978-3-031-18050-7}, isbn = {978-3-031-18050-7}, year = {2022}, date = {2022-09-05}, volume = {531}, publisher = {Springer}, series = {Lecture Notes in Networks and Systems}, keywords = {big data, clustering, deep learning, IoT}, pubstate = {published}, tppubtype = {proceedings} } |
P. García-Bringas and H. Pérez-García and F. J. Martínez de Pisón and J. R. Villar-Flecha and A. Troncoso and E. A. de la Cal and Á. Herrero and F. Martínez-Álvarez and G. Psaila and H. Quintián and E. Corchado Proceedings of the International Joint Conference 15th International Conference on Computational Intelligence in Security for Information Systems (CISIS 2022) 13th International Conference on EUropean Transnational Education (ICEUTE 2022). Salamanca, Spain, September 5-7, 2022 (Proceedings) Springer, 532 , 2022, ISBN: 978-3-031-18409-3. (Links | BibTeX | Tags: big data, deep learning) @proceedings{CISIS-ICEUTE2022, title = {Proceedings of the International Joint Conference 15th International Conference on Computational Intelligence in Security for Information Systems (CISIS 2022) 13th International Conference on EUropean Transnational Education (ICEUTE 2022). Salamanca, Spain, September 5-7, 2022}, author = {P. García-Bringas and H. Pérez-García and F. J. Martínez de Pisón and J. R. Villar-Flecha and A. Troncoso and E. A. de la Cal and Á. Herrero and F. Martínez-Álvarez and G. Psaila and H. Quintián and E. Corchado}, url = {https://link.springer.com/book/10.1007/978-3-031-18409-3}, doi = {https://doi.org/10.1007/978-3-031-18409-3}, isbn = {978-3-031-18409-3}, year = {2022}, date = {2022-09-05}, volume = {532}, publisher = {Springer}, series = {Lecture Notes in Networks and Systems}, keywords = {big data, deep learning}, pubstate = {published}, tppubtype = {proceedings} } |
L. Melgar-García and D. Gutiérrez-Avilés and M. T. Godinho and R. Espada and I. S. Brito and F. Martínez-Álvarez and A. Troncoso and C. Rubio-Escudero A new big data triclustering approach for extracting three-dimensional patterns in precision agriculture (Journal Article) Neurocomputing, 500 , pp. 268-278, 2022. (Abstract | Links | BibTeX | Tags: big data, pattern recognition) @article{MELGAR21_NEUCOMb, title = {A new big data triclustering approach for extracting three-dimensional patterns in precision agriculture}, author = {L. Melgar-García and D. Gutiérrez-Avilés and M. T. Godinho and R. Espada and I. S. Brito and F. Martínez-Álvarez and A. Troncoso and C. Rubio-Escudero}, url = {https://www.sciencedirect.com/science/article/abs/pii/S0925231222006415}, doi = {https://doi.org/10.1016/j.neucom.2021.06.101}, year = {2022}, date = {2022-01-01}, journal = {Neurocomputing}, volume = {500}, pages = {268-278}, abstract = {Precision agriculture focuses on the development of site-specific harvest considering the variability of each crop area. Vegetation indices allow the study and delineation of different characteristics of each field zone, generally invisible to the naked-eye. This paper introduces a new big data triclustering approach based on evolutionary algorithms. The algorithm shows its capability to discover three-dimensional patterns on the basis of vegetation indices from vine crops. Different vegetation indices have been tested to find different patterns in the crops. The results reported using a vineyard crop located in Portugal depicts four areas with different moisture stress particularities that can lead to changes in the management of the vineyard. 
Furthermore, scalability studies have been performed, showing that the proposed algorithm is suitable for dealing with big datasets.}, keywords = {big data, pattern recognition}, pubstate = {published}, tppubtype = {article} } Precision agriculture focuses on the development of site-specific harvest considering the variability of each crop area. Vegetation indices allow the study and delineation of different characteristics of each field zone, generally invisible to the naked-eye. This paper introduces a new big data triclustering approach based on evolutionary algorithms. The algorithm shows its capability to discover three-dimensional patterns on the basis of vegetation indices from vine crops. Different vegetation indices have been tested to find different patterns in the crops. The results reported using a vineyard crop located in Portugal depicts four areas with different moisture stress particularities that can lead to changes in the management of the vineyard. Furthermore, scalability studies have been performed, showing that the proposed algorithm is suitable for dealing with big datasets. |
J. A. Gallardo-Gómez and F. Divina and A. Troncoso and F. Martínez-Álvarez Explainable Artificial Intelligence for the Electric Vehicle Load Demand Forecasting Problem (Conference) SOCO 17th International Conference on Soft Computing Models in Industrial and Environmental Applications, 531 , Lecture Notes in Networks and Systems 2022. (Links | BibTeX | Tags: big data, energy, time series) @conference{gallardo2022explainable, title = {Explainable Artificial Intelligence for the Electric Vehicle Load Demand Forecasting Problem}, author = {J. A. Gallardo-Gómez and F. Divina and A. Troncoso and F. Martínez-Álvarez}, url = {https://link.springer.com/book/10.1007/978-3-031-18050-7}, year = {2022}, date = {2022-01-01}, booktitle = {SOCO 17th International Conference on Soft Computing Models in Industrial and Environmental Applications}, volume = {531}, pages = {413-422}, series = {Lecture Notes in Networks and Systems}, keywords = {big data, energy, time series}, pubstate = {published}, tppubtype = {conference} } |
F. Morales and M. García-Torres and G. Velázquez and F. Daumas-Ladouce and P. Gardel-Sotomayor and F. Gómez-Vela and F. Divina and J. L. Vázquez Noguera and C. Sauer Ayala and D. Pinto-Roa Analysis of Electric Energy Consumption Profiles Using a Machine Learning Approach: A Paraguayan Case Study (Journal Article) Electronics, 11 (2), pp. 267, 2022. (Abstract | Links | BibTeX | Tags: big data, energy, time series) @article{morales2022analysisb, title = {Analysis of Electric Energy Consumption Profiles Using a Machine Learning Approach: A Paraguayan Case Study}, author = {F. Morales and M. García-Torres and G. Velázquez and F. Daumas-Ladouce and P. Gardel-Sotomayor and F. Gómez-Vela and F. Divina and J. L. Vázquez Noguera and C. Sauer Ayala and D. Pinto-Roa}, url = {https://www.mdpi.com/2079-9292/11/2/267}, doi = {10.3390/electronics11020267}, year = {2022}, date = {2022-01-01}, journal = {Electronics}, volume = {11}, number = {2}, pages = {267}, abstract = {Correctly defining and grouping electrical feeders is of great importance for electrical system operators. In this paper, we compare two different clustering techniques, K-means and hierarchical agglomerative clustering, applied to real data from the east region of Paraguay. The raw data were pre-processed, resulting in four data sets, namely, (i) a weekly feeder demand, (ii) a monthly feeder demand, (iii) a statistical feature set extracted from the original data and (iv) a seasonal and daily consumption feature set obtained considering the characteristics of the Paraguayan load curve. Considering the four data sets, two clustering algorithms, two distance metrics and five linkage criteria a total of 36 models with the Silhouette, Davies–Bouldin and Calinski–Harabasz index scores was assessed. 
The K-means algorithms with the seasonal feature data sets showed the best performance considering the Silhouette, Calinski–Harabasz and Davies–Bouldin validation index scores with a configuration of six clusters.}, keywords = {big data, energy, time series}, pubstate = {published}, tppubtype = {article} } Correctly defining and grouping electrical feeders is of great importance for electrical system operators. In this paper, we compare two different clustering techniques, K-means and hierarchical agglomerative clustering, applied to real data from the east region of Paraguay. The raw data were pre-processed, resulting in four data sets, namely, (i) a weekly feeder demand, (ii) a monthly feeder demand, (iii) a statistical feature set extracted from the original data and (iv) a seasonal and daily consumption feature set obtained considering the characteristics of the Paraguayan load curve. Considering the four data sets, two clustering algorithms, two distance metrics and five linkage criteria a total of 36 models with the Silhouette, Davies–Bouldin and Calinski–Harabasz index scores was assessed. The K-means algorithms with the seasonal feature data sets showed the best performance considering the Silhouette, Calinski–Harabasz and Davies–Bouldin validation index scores with a configuration of six clusters. |
2021 |
M. García-Torres and F. Gómez-Vela and F. Divina and D.P. Pinto-Roa and J.L. Vázquez Noguera and J.C. Román Scatter search for high-dimensional feature selection using feature grouping (Conference) GECCO Genetic and Evolutionary Computation Conference, 2021. (Links | BibTeX | Tags: big data, feature selection, pattern recognition) @conference{garcia2021scatter, title = {Scatter search for high-dimensional feature selection using feature grouping}, author = {M. García-Torres and F. Gómez-Vela and F. Divina and D.P. Pinto-Roa and J.L. Vázquez Noguera and J.C. Román}, doi = {10.1145/3449726.3459481}, year = {2021}, date = {2021-07-01}, booktitle = {GECCO Genetic and Evolutionary Computation Conference}, pages = {149--150}, keywords = {big data, feature selection, pattern recognition}, pubstate = {published}, tppubtype = {conference} } |
J. F. Torres and D. Hadjout and A. Sebaa and F. Martínez-Álvarez and A. Troncoso Deep Learning for Time Series Forecasting: A Survey (Journal Article) Big Data, 9 (1), pp. 3-21, 2021. (Abstract | Links | BibTeX | Tags: big data, deep learning, time series) @article{TORRES21, title = {Deep Learning for Time Series Forecasting: A Survey}, author = {J. F. Torres and D. Hadjout and A. Sebaa and F. Martínez-Álvarez and A. Troncoso}, url = {https://www.liebertpub.com/doi/10.1089/big.2020.0159}, doi = {10.1089/big.2020.0159}, year = {2021}, date = {2021-02-05}, journal = {Big Data}, volume = {9}, number = {1}, pages = {3-21}, abstract = {Deep learning, one of the most remarkable techniques of machine learning, has been a major success in many fields, including image processing, speech recognition, and text understanding. It is powerful engines capable of learning arbitrary mapping functions, not require a scaled or stationary time series as input, support multivariate inputs, and support multi-step outputs. All of these features together make deep learning useful tools when dealing with more complex time series prediction problems involving large amounts of data, and multiple variables with complex relationships. This paper provides an overview of the most common Deep Learning types for time series forecasting, Explain the relationships between deep learning models and classical approaches to time series forecasting. A brief background of the particular challenges presents in time-series data and the most common deep learning techniques that are often used for time series forecasting is provided. Previous studies that applied deep learning to time series are reviewed.}, keywords = {big data, deep learning, time series}, pubstate = {published}, tppubtype = {article} } Deep learning, one of the most remarkable techniques of machine learning, has been a major success in many fields, including image processing, speech recognition, and text understanding. 
It is powerful engines capable of learning arbitrary mapping functions, not require a scaled or stationary time series as input, support multivariate inputs, and support multi-step outputs. All of these features together make deep learning useful tools when dealing with more complex time series prediction problems involving large amounts of data, and multiple variables with complex relationships. This paper provides an overview of the most common Deep Learning types for time series forecasting, Explain the relationships between deep learning models and classical approaches to time series forecasting. A brief background of the particular challenges presents in time-series data and the most common deep learning techniques that are often used for time series forecasting is provided. Previous studies that applied deep learning to time series are reviewed. |
L. Melgar-García and D. Gutiérrez-Avilés and C. Rubio-Escudero and A. Troncoso Discovering three-dimensional patterns in real-time from data streams: An online triclustering approach (Journal Article) Information Sciences, 558 , pp. 174-193, 2021. (Abstract | Links | BibTeX | Tags: big data, IoT, pattern recognition) @article{Melgar21_IS, title = {Discovering three-dimensional patterns in real-time from data streams: An online triclustering approach}, author = {L. Melgar-García and D. Gutiérrez-Avilés and C. Rubio-Escudero and A. Troncoso}, url = {https://www.sciencedirect.com/science/article/pii/S0020025521000220}, doi = {10.1016/j.ins.2020.12.089}, year = {2021}, date = {2021-01-01}, journal = {Information Sciences}, volume = {558}, pages = {174-193}, abstract = {Triclustering algorithms group sets of coordinates of 3-dimensional datasets. In this paper, a new triclustering approach for data streams is introduced. It follows a streaming scheme of learning in two steps: offline and online phases. First, the offline phase provides a summary model with the components of the triclusters. Then, the second stage is the online phase to deal with data in streaming. This online phase consists in using the summary model obtained in the offline stage to update the triclusters as fast as possible with genetic operators. Results using three types of synthetic datasets and a real-world environmental sensor dataset are reported. The performance of the proposed triclustering streaming algorithm is compared to a batch triclustering algorithm, showing an accurate performance both in terms of quality and running times. }, keywords = {big data, IoT, pattern recognition}, pubstate = {published}, tppubtype = {article} } Triclustering algorithms group sets of coordinates of 3-dimensional datasets. In this paper, a new triclustering approach for data streams is introduced. It follows a streaming scheme of learning in two steps: offline and online phases. 
First, the offline phase provides a summary model with the components of the triclusters. Then, the second stage is the online phase to deal with data in streaming. This online phase consists in using the summary model obtained in the offline stage to update the triclusters as fast as possible with genetic operators. Results using three types of synthetic datasets and a real-world environmental sensor dataset are reported. The performance of the proposed triclustering streaming algorithm is compared to a batch triclustering algorithm, showing an accurate performance both in terms of quality and running times. |
J. A. Gallardo and M. García-Torres and F. Gómez-Vela and F. Morales and F. Divina and D. Becerra-Alonso and G. Velázquez and F. Daumas-Ladouce and J. L. Vázquez Noguera and C. Ayala Sauer Forecasting Electricity Consumption Data from Paraguay Using a Machine Learning Approach (Conference) SOCO 16th International Conference on Soft Computing Models in Industrial and Environmental Applications, 1401 , Advances in Intelligent Systems and Computing 2021. (Links | BibTeX | Tags: big data, energy, time series) @conference{gallardo2022forecasting, title = {Forecasting Electricity Consumption Data from Paraguay Using a Machine Learning Approach}, author = {J. A. Gallardo and M. García-Torres and F. Gómez-Vela and F. Morales and F. Divina and D. Becerra-Alonso and G. Velázquez and F. Daumas-Ladouce and J. L. Vázquez Noguera and C. Ayala Sauer}, url = {https://link.springer.com/chapter/10.1007/978-3-030-87869-6_65}, year = {2021}, date = {2021-01-01}, booktitle = {SOCO 16th International Conference on Soft Computing Models in Industrial and Environmental Applications}, volume = {1401}, pages = {685-694}, series = {Advances in Intelligent Systems and Computing}, keywords = {big data, energy, time series}, pubstate = {published}, tppubtype = {conference} } |
2020 |
P. Jiménez-Herrera and L. Melgar-García and G. Asencio-Cortés and A. Troncoso A New Forecasting Algorithm Based on Neighbors for Streaming Electricity Time Series (Conference) HAIS 15th International Conference on Hybrid Artificial Intelligence Systems, Lecture Notes in Computer Science 2020. (Links | BibTeX | Tags: big data, energy, IoT, time series) @conference{HAIS2020, title = {A New Forecasting Algorithm Based on Neighbors for Streaming Electricity Time Series}, author = {P. Jiménez-Herrera and L. Melgar-García and G. Asencio-Cortés and A. Troncoso}, url = {https://link.springer.com/chapter/10.1007/978-3-030-61705-9_43}, year = {2020}, date = {2020-11-04}, booktitle = {HAIS 15th International Conference on Hybrid Artificial Intelligence Systems}, pages = {522-533}, series = {Lecture Notes in Computer Science}, keywords = {big data, energy, IoT, time series}, pubstate = {published}, tppubtype = {conference} } |
F. Divina and J. F. Torres and M. García-Torres and F. Martínez-Álvarez and A. Troncoso Hybridizing deep learning and neuroevolution: Application to the Spanish short-term electric energy consumption forecasting (Journal Article) Applied Sciences, 10 (16), pp. 5487, 2020. (Abstract | Links | BibTeX | Tags: big data, deep learning, energy, time series) @article{DIVINA2020, title = {Hybridizing deep learning and neuroevolution: Application to the Spanish short-term electric energy consumption forecasting}, author = {F. Divina and J. F. Torres and M. García-Torres and F. Martínez-Álvarez and A. Troncoso}, url = {https://www.mdpi.com/2076-3417/10/16/5487}, doi = {https://doi.org/10.3390/app10165487}, year = {2020}, date = {2020-07-30}, journal = {Applied Sciences}, volume = {10}, number = {16}, pages = {5487}, abstract = {The electric energy production would be much more efficient if accurate estimations of the future demand were available, since these would allow allocating only the resources needed for the production of the right amount of energy required. With this motivation in mind, we propose a strategy, based on neuroevolution, that can be used to this aim. Our proposal uses a genetic algorithm in order to find a sub-optimal set of hyper-parameters for configuring a deep neural network, which can then be used for obtaining the forecasting. Such a strategy is justified by the observation that the performances achieved by deep neural networks are strongly dependent on the right setting of the hyper-parameters, and genetic algorithms have shown excellent search capabilities in huge search spaces. Moreover, we base our proposal on a distributed computing platform, which allows its use on a large time-series. In order to assess the performances of our approach, we have applied it to a large dataset, related to the electric energy consumption registered in Spain over almost 10 years. 
Experimental results confirm the validity of our proposal since it outperforms all other forecasting techniques to which it has been compared.}, keywords = {big data, deep learning, energy, time series}, pubstate = {published}, tppubtype = {article} } The electric energy production would be much more efficient if accurate estimations of the future demand were available, since these would allow allocating only the resources needed for the production of the right amount of energy required. With this motivation in mind, we propose a strategy, based on neuroevolution, that can be used to this aim. Our proposal uses a genetic algorithm in order to find a sub-optimal set of hyper-parameters for configuring a deep neural network, which can then be used for obtaining the forecasting. Such a strategy is justified by the observation that the performances achieved by deep neural networks are strongly dependent on the right setting of the hyper-parameters, and genetic algorithms have shown excellent search capabilities in huge search spaces. Moreover, we base our proposal on a distributed computing platform, which allows its use on a large time-series. In order to assess the performances of our approach, we have applied it to a large dataset, related to the electric energy consumption registered in Spain over almost 10 years. Experimental results confirm the validity of our proposal since it outperforms all other forecasting techniques to which it has been compared. |
F. Martínez-Álvarez and G. Asencio-Cortés and J. F. Torres and D. Gutiérrez-Avilés and L. Melgar-García and R. Pérez-Chacón and C. Rubio-Escudero and A. Troncoso and J. C. Riquelme Coronavirus Optimization Algorithm: A bioinspired metaheuristic based on the COVID-19 propagation model (Journal Article) Big Data, 8 (4), pp. 308-322, 2020. (Abstract | Links | BibTeX | Tags: big data, deep learning, energy, time series) @article{MARTINEZ-ALVAREZ20, title = {Coronavirus Optimization Algorithm: A bioinspired metaheuristic based on the COVID-19 propagation model}, author = {F. Martínez-Álvarez and G. Asencio-Cortés and J. F. Torres and D. Gutiérrez-Avilés and L. Melgar-García and R. Pérez-Chacón and C. Rubio-Escudero and A. Troncoso and J. C. Riquelme}, url = {https://www.liebertpub.com/doi/full/10.1089/big.2020.0051}, doi = {10.1089/big.2020.0051}, year = {2020}, date = {2020-07-22}, journal = {Big Data}, volume = {8}, number = {4}, pages = {308-322}, abstract = {This work proposes a novel bioinspired metaheuristic, simulating how the coronavirus spreads and infects healthy people. From a primary infected individual (patient zero), the coronavirus rapidly infects new victims, creating large populations of infected people who will either die or spread infection. Relevant terms such as reinfection probability, super-spreading rate, social distancing measures or traveling rate are introduced into the model in order to simulate the coronavirus activity as accurately as possible. The infected population initially grows exponentially over time, but taking into consideration social isolation measures, the mortality rate and number of recoveries, the infected population gradually decreases. The Coronavirus Optimization Algorithm has two major advantages when compared to other similar strategies. Firstly, the input parameters are already set according to the disease statistics, preventing researchers from initializing them with arbitrary values. 
Secondly, the approach has the ability to end after several iterations, without setting this value either. Furthermore, a parallel multi-virus version is proposed, where several coronavirus strains evolve over time and explore wider search space areas in less iterations. Finally, the metaheuristic has been combined with deep learning models, in order to find optimal hyperparameters during the training phase. As application case, the problem of electricity load time series forecasting has been addressed, showing quite remarkable performance.}, keywords = {big data, deep learning, energy, time series}, pubstate = {published}, tppubtype = {article} } This work proposes a novel bioinspired metaheuristic, simulating how the coronavirus spreads and infects healthy people. From a primary infected individual (patient zero), the coronavirus rapidly infects new victims, creating large populations of infected people who will either die or spread infection. Relevant terms such as reinfection probability, super-spreading rate, social distancing measures or traveling rate are introduced into the model in order to simulate the coronavirus activity as accurately as possible. The infected population initially grows exponentially over time, but taking into consideration social isolation measures, the mortality rate and number of recoveries, the infected population gradually decreases. The Coronavirus Optimization Algorithm has two major advantages when compared to other similar strategies. Firstly, the input parameters are already set according to the disease statistics, preventing researchers from initializing them with arbitrary values. Secondly, the approach has the ability to end after several iterations, without setting this value either. Furthermore, a parallel multi-virus version is proposed, where several coronavirus strains evolve over time and explore wider search space areas in less iterations. 
Finally, the metaheuristic has been combined with deep learning models, in order to find optimal hyperparameters during the training phase. As application case, the problem of electricity load time series forecasting has been addressed, showing quite remarkable performance. |
R. Pérez-Chacón and G. Asencio-Cortés and F. Martínez-Álvarez and A. Troncoso Big data time series forecasting based on pattern sequence similarity and its application to the electricity demand (Journal Article) Information Sciences, 540 , pp. 160-174, 2020. (Abstract | Links | BibTeX | Tags: big data, energy, time series) @article{PEREZ20, title = {Big data time series forecasting based on pattern sequence similarity and its application to the electricity demand}, author = {R. Pérez-Chacón and G. Asencio-Cortés and F. Martínez-Álvarez and A. Troncoso}, url = {https://www.sciencedirect.com/science/article/pii/S0020025520306010}, doi = {10.1016/j.ins.2020.06.014}, year = {2020}, date = {2020-06-06}, journal = {Information Sciences}, volume = {540}, pages = {160-174}, abstract = {This work proposes a novel algorithm to forecast big data time series. Based on the well-established Pattern Sequence Forecasting algorithm, this new approach has two major contributions to the literature. First, the improvement of the aforementioned algorithm with respect to the accuracy of predictions, and second, its transformation into the big data context, having reached meaningful results in terms of scalability. The algorithm uses the Apache Spark distributed computation framework and it is a ready-to-use application with few parameters to adjust. Physical and cloud clusters have been used to carry out the experimentation, which consisted in applying the algorithm to real-world data from Uruguay electricity demand.}, keywords = {big data, energy, time series}, pubstate = {published}, tppubtype = {article} } This work proposes a novel algorithm to forecast big data time series. Based on the well-established Pattern Sequence Forecasting algorithm, this new approach has two major contributions to the literature. 
First, the improvement of the aforementioned algorithm with respect to the accuracy of predictions, and second, its transformation into the big data context, having reached meaningful results in terms of scalability. The algorithm uses the Apache Spark distributed computation framework and it is a ready-to-use application with few parameters to adjust. Physical and cloud clusters have been used to carry out the experimentation, which consisted in applying the algorithm to real-world data from Uruguay electricity demand. |
A. M. Fernández and D. Gutiérrez-Avilés and A. Troncoso and F. Martínez-Álvarez Automated Deployment of a Spark Cluster with Machine Learning Algorithm Integration (Journal Article) Big Data Research, 19-20 , pp. 100135, 2020. (Abstract | Links | BibTeX | Tags: big data, time series) @article{FERNANDEZ20, title = {Automated Deployment of a Spark Cluster with Machine Learning Algorithm Integration}, author = {A. M. Fernández and D. Gutiérrez-Avilés and A. Troncoso and F. Martínez-Álvarez}, url = {https://www.sciencedirect.com/science/article/pii/S2214579620300034}, doi = {10.1016/j.bdr.2020.100135}, year = {2020}, date = {2020-05-12}, journal = {Big Data Research}, volume = {19-20}, pages = {100135}, abstract = {The vast amount of data stored nowadays has turned big data analytics into a very trendy research field. The Spark distributed computing platform has emerged as a dominant and widely used paradigm for cluster deployment and big data analytics. However, to get started up is still a task that may take much time when manually done, due to the requisites that all nodes must fulfill. This work introduces LadonSpark, an open-source and non-commercial solution to configure and deploy a Spark cluster automatically. It has been specially designed for easy and efficient management of a Spark cluster with a friendly graphical user interface to automate the deployment of a cluster and to start up the distributed file system of Hadoop quickly. Moreover, LadonSpark includes the functionality of integrating any algorithm into the system. That is, the user only needs to provide the executable file and the number of required inputs for proper parametrization. Source codes developed in Scala, R, Python, or Java can be supported on LadonSpark. 
Besides, clustering, regression, classification, and association rules algorithms are already integrated so that users can test its usability from its initial installation.}, keywords = {big data, time series}, pubstate = {published}, tppubtype = {article} } The vast amount of data stored nowadays has turned big data analytics into a very trendy research field. The Spark distributed computing platform has emerged as a dominant and widely used paradigm for cluster deployment and big data analytics. However, to get started up is still a task that may take much time when manually done, due to the requisites that all nodes must fulfill. This work introduces LadonSpark, an open-source and non-commercial solution to configure and deploy a Spark cluster automatically. It has been specially designed for easy and efficient management of a Spark cluster with a friendly graphical user interface to automate the deployment of a cluster and to start up the distributed file system of Hadoop quickly. Moreover, LadonSpark includes the functionality of integrating any algorithm into the system. That is, the user only needs to provide the executable file and the number of required inputs for proper parametrization. Source codes developed in Scala, R, Python, or Java can be supported on LadonSpark. Besides, clustering, regression, classification, and association rules algorithms are already integrated so that users can test its usability from its initial installation. |
F. Martínez-Álvarez and D. T. Bui Advanced Machine Learning and Big Data Analytics in Remote Sensing for Natural Hazards Management (Editorial) (Journal Article) Remote Sensing, 12 (2), pp. 301, 2020, ISSN: 2072-4292. (Abstract | Links | BibTeX | Tags: big data, natural disasters) @article{MARTINEZ20c, title = {Advanced Machine Learning and Big Data Analytics in Remote Sensing for Natural Hazards Management (Editorial)}, author = {F. Martínez-Álvarez and D. T. Bui}, url = {https://www.mdpi.com/2072-4292/12/2/301}, doi = {10.3390/rs12020301}, issn = {2072-4292}, year = {2020}, date = {2020-01-01}, journal = {Remote Sensing}, volume = {12}, number = {2}, pages = {301}, abstract = {This editorial summarizes the performance of the special issue entitled Advanced Machine Learning and Big Data Analytics in Remote Sensing for Natural Hazards Management, which was published in MDPI’s Remote Sensing journal. The special issue ran during 2018 and 2019 and accepted a total of nine papers from authors in thirteen different countries. So far, these papers have received 116 citations. Earthquakes, landslides, floods, wildfire and soil salinity were the topics analyzed. New methods were introduced, with applications of the utmost relevance.}, keywords = {big data, natural disasters}, pubstate = {published}, tppubtype = {article} } This editorial summarizes the performance of the special issue entitled Advanced Machine Learning and Big Data Analytics in Remote Sensing for Natural Hazards Management, which was published in MDPI’s Remote Sensing journal. The special issue ran during 2018 and 2019 and accepted a total of nine papers from authors in thirteen different countries. So far, these papers have received 116 citations. Earthquakes, landslides, floods, wildfire and soil salinity were the topics analyzed. New methods were introduced, with applications of the utmost relevance. |
2019 |
F. Martínez-Álvarez and A. Morales-Esteban Big data and natural disasters: New approaches for temporal and spatial massive data analysis (Editorial) (Journal Article) Computers and Geosciences, 129 , pp. 38-39, 2019. (Links | BibTeX | Tags: big data, natural disasters, time series) @article{MARTINEZ19, title = {Big data and natural disasters: New approaches for temporal and spatial massive data analysis (Editorial)}, author = {F. Martínez-Álvarez and A. Morales-Esteban}, url = {https://www.sciencedirect.com/science/article/pii/S009830041930411X?dgcid=rss_sd_all}, doi = {https://doi.org/10.1016/j.cageo.2019.04.012}, year = {2019}, date = {2019-08-01}, journal = {Computers and Geosciences}, volume = {129}, pages = {38-39}, keywords = {big data, natural disasters, time series}, pubstate = {published}, tppubtype = {article} } |
R. Talavera-Llames and R. Pérez-Chacón and A. Troncoso and F. Martínez-Álvarez MV-kWNN: A novel multivariate and multi-output weighted nearest neighbors algorithm for big data time series forecasting (Journal Article) Neurocomputing, 353 , pp. 56-73, 2019. (Abstract | Links | BibTeX | Tags: big data, energy, time series) @article{NEUCOM2019, title = {MV-kWNN: A novel multivariate and multi-output weighted nearest neighbors algorithm for big data time series forecasting}, author = {R. Talavera-Llames and R. Pérez-Chacón and A. Troncoso and F. Martínez-Álvarez}, url = {https://www.sciencedirect.com/science/article/pii/S0925231219303236?via%3Dihub}, doi = {10.1016/j.neucom.2018.07.092}, year = {2019}, date = {2019-01-01}, journal = {Neurocomputing}, volume = {353}, pages = {56-73}, abstract = {This paper introduces a novel algorithm for big data time series forecasting. Its main novelty lies in its ability to deal with multivariate data, i.e. to consider multiple time series simultaneously, in order to make multi-output predictions. Real-world processes are typically characterised by several interrelated variables, and the future occurrence of certain time series cannot be explained without understanding the influence that other time series might have on the target time series. One key issue in the context of the multivariate analysis is to determine a priori whether exogenous variables must be included in the model or not. To deal with this, a correlation analysis is used to find a minimum correlation threshold that an exogenous time series must exhibit, in order to be beneficial. Furthermore, the proposed approach has been specifically designed to be used in the context of big data, thus making it possible to efficiently process very large time series. To evaluate the performance of the proposed approach we use data from Spanish electricity prices. 
Results have been compared to other multivariate approaches showing remarkable improvements both in terms of accuracy and execution time.}, keywords = {big data, energy, time series}, pubstate = {published}, tppubtype = {article} } This paper introduces a novel algorithm for big data time series forecasting. Its main novelty lies in its ability to deal with multivariate data, i.e. to consider multiple time series simultaneously, in order to make multi-output predictions. Real-world processes are typically characterised by several interrelated variables, and the future occurrence of certain time series cannot be explained without understanding the influence that other time series might have on the target time series. One key issue in the context of the multivariate analysis is to determine a priori whether exogenous variables must be included in the model or not. To deal with this, a correlation analysis is used to find a minimum correlation threshold that an exogenous time series must exhibit, in order to be beneficial. Furthermore, the proposed approach has been specifically designed to be used in the context of big data, thus making it possible to efficiently process very large time series. To evaluate the performance of the proposed approach we use data from Spanish electricity prices. Results have been compared to other multivariate approaches showing remarkable improvements both in terms of accuracy and execution time. |
A. M. Fernández and D. Gutiérrez-Avilés and A. Troncoso and F. Martínez-Álvarez Real-Time Big Data Analytics in Smart Cities from LoRa-based IoT Networks (Conference) SOCO 14th International Conference on Soft Computing Models in Industrial and Environmental Applications, Advances in Intelligent Systems and Computing 2019. (Links | BibTeX | Tags: big data, IoT) @conference{SOCO2019, title = {Real-Time Big Data Analytics in Smart Cities from LoRa-based IoT Networks}, author = {A. M. Fernández and D. Gutiérrez-Avilés and A. Troncoso and F. Martínez-Álvarez}, url = {https://link.springer.com/chapter/10.1007/978-3-030-20055-8_9}, year = {2019}, date = {2019-01-01}, booktitle = {SOCO 14th International Conference on Soft Computing Models in Industrial and Environmental Applications}, series = {Advances in Intelligent Systems and Computing}, keywords = {big data, IoT}, pubstate = {published}, tppubtype = {conference} } |
A. Galicia and R. Talavera-Llames and A. Troncoso and I. Koprinska and F. Martínez-Álvarez Multi-step forecasting for big data time series based on ensemble learning (Journal Article) Knowledge-Based Systems, 163 , pp. 830-841, 2019. (Links | BibTeX | Tags: big data, time series) @article{GALICIA19, title = {Multi-step forecasting for big data time series based on ensemble learning}, author = {A. Galicia and R. Talavera-Llames and A. Troncoso and I. Koprinska and F. Martínez-Álvarez}, url = {https://www.sciencedirect.com/science/article/abs/pii/S0950705118304957}, doi = {https://doi.org/10.1016/j.knosys.2018.10.009}, year = {2019}, date = {2019-01-01}, journal = {Knowledge-Based Systems}, volume = {163}, pages = {830-841}, keywords = {big data, time series}, pubstate = {published}, tppubtype = {article} } |
2018 |
R. Pérez-Chacón and J. M. Luna and A. Troncoso and F. Martínez-Álvarez and J. C. Riquelme Big data analytics for discovering electricity consumption patterns in smart cities (Journal Article) Energies, 11 (3), pp. 683, 2018. (Abstract | Links | BibTeX | Tags: big data, energy, time series) @article{Energies2018, title = {Big data analytics for discovering electricity consumption patterns in smart cities}, author = {R. Pérez-Chacón and J. M. Luna and A. Troncoso and F. Martínez-Álvarez and J. C. Riquelme}, url = {http://www.mdpi.com/1996-1073/11/3/683 }, doi = {10.3390/en11030683 }, year = {2018}, date = {2018-01-01}, journal = {Energies}, volume = {11}, number = {3}, pages = {683}, abstract = {New technologies such as sensor networks have been incorporated into the management of buildings for organizations and cities. Sensor networks have led to an exponential increase in the volume of data available in recent years, which can be used to extract consumption patterns for the purposes of energy and monetary savings. For this reason, new approaches and strategies are needed to analyze information in big data environments. This paper proposes a methodology to extract electric energy consumption patterns in big data time series, so that very valuable conclusions can be made for managers and governments. The methodology is based on the study of four clustering validity indices in their parallelized versions along with the application of a clustering technique. In particular, this work uses a voting system to choose an optimal number of clusters from the results of the indices, as well as the application of the distributed version of the k-means algorithm included in Apache Spark’s Machine Learning Library. The results, using electricity consumption for the years 2011–2017 for eight buildings of a public university, are presented and discussed. 
In addition, the performance of the proposed methodology is evaluated using synthetic big data, which can represent thousands of buildings in a smart city. Finally, policies derived from the patterns discovered are proposed to optimize energy usage across the university campus.}, keywords = {big data, energy, time series}, pubstate = {published}, tppubtype = {article} } New technologies such as sensor networks have been incorporated into the management of buildings for organizations and cities. Sensor networks have led to an exponential increase in the volume of data available in recent years, which can be used to extract consumption patterns for the purposes of energy and monetary savings. For this reason, new approaches and strategies are needed to analyze information in big data environments. This paper proposes a methodology to extract electric energy consumption patterns in big data time series, so that very valuable conclusions can be made for managers and governments. The methodology is based on the study of four clustering validity indices in their parallelized versions along with the application of a clustering technique. In particular, this work uses a voting system to choose an optimal number of clusters from the results of the indices, as well as the application of the distributed version of the k-means algorithm included in Apache Spark’s Machine Learning Library. The results, using electricity consumption for the years 2011–2017 for eight buildings of a public university, are presented and discussed. In addition, the performance of the proposed methodology is evaluated using synthetic big data, which can represent thousands of buildings in a smart city. Finally, policies derived from the patterns discovered are proposed to optimize energy usage across the university campus. |
A. Galicia and J. F. Torres and F. Martínez-Álvarez and A. Troncoso A novel Spark-based multi-step forecasting algorithm for big data time series (Journal Article) Information Sciences, 467 , pp. 800-818, 2018. (Abstract | Links | BibTeX | Tags: big data, energy, time series) @article{INFSCI2018, title = {A novel Spark-based multi-step forecasting algorithm for big data time series}, author = {A. Galicia and J. F. Torres and F. Martínez-Álvarez and A. Troncoso}, url = {https://www.sciencedirect.com/science/article/pii/S0020025518304493}, doi = {10.1016/j.ins.2018.06.010}, year = {2018}, date = {2018-01-01}, journal = {Information Sciences}, volume = {467}, pages = {800-818}, abstract = {This paper presents different scalable methods for predicting big time series, namely time series with a high frequency measurement. Methods are also developed to deal with arbitrary prediction horizons. The Apache Spark framework is proposed for distributed computing in order to achieve the scalability of the methods. Prediction methods have been developed using Spark’s MLlib library for machine learning. Since the library does not support multivariate regression, the prediction problem is formulated as h prediction sub-problems, where h is the number of future values to predict, that is, the prediction horizon. Furthermore, different kinds of representative methods have been chosen, such as decision trees, two tree-based ensemble techniques (Gradient-Boosted and Random Forest) and a linear regression method as a reference method for comparisons. Finally, the methodology has been tested in a real time series of electrical demand in Spain, with a time interval of ten minutes between measurements.}, keywords = {big data, energy, time series}, pubstate = {published}, tppubtype = {article} } This paper presents different scalable methods for predicting big time series, namely time series with a high frequency measurement. 
Methods are also developed to deal with arbitrary prediction horizons. The Apache Spark framework is proposed for distributed computing in order to achieve the scalability of the methods. Prediction methods have been developed using Spark’s MLlib library for machine learning. Since the library does not support multivariate regression, the prediction problem is formulated as h prediction sub-problems, where h is the number of future values to predict, that is, the prediction horizon. Furthermore, different kinds of representative methods have been chosen, such as decision trees, two tree-based ensemble techniques (Gradient-Boosted and Random Forest) and a linear regression method as a reference method for comparisons. Finally, the methodology has been tested in a real time series of electrical demand in Spain, with a time interval of ten minutes between measurements. |
R. Talavera-Llames and R. Pérez-Chacón and A. Troncoso and F. Martínez-Álvarez Big data time series forecasting based on nearest neighbors distributed computing with Spark (Journal Article) Knowledge-Based Systems, 161 (1), pp. 12-25, 2018. (Abstract | Links | BibTeX | Tags: big data, energy, time series) @article{KNOSYS2018b, title = {Big data time series forecasting based on nearest neighbors distributed computing with Spark}, author = {R. Talavera-Llames and R. Pérez-Chacón and A. Troncoso and F. Martínez-Álvarez}, url = {https://www.sciencedirect.com/science/article/pii/S0950705118303770}, doi = {10.1016/j.knosys.2018.07.026}, year = {2018}, date = {2018-01-01}, journal = {Knowledge-Based Systems}, volume = {161}, number = {1}, pages = {12-25}, abstract = {A new approach for big data forecasting based on the k-weighted nearest neighbours algorithm is introduced in this work. Such an algorithm has been developed for distributed computing under the Apache Spark framework. Every phase of the algorithm is explained in this work, along with how the optimal values of the input parameters required for the algorithm are obtained. In order to test the developed algorithm, a Spanish energy consumption big data time series has been used. The accuracy of the prediction has been assessed showing remarkable results. Additionally, the optimal configuration of a Spark cluster has been discussed. Finally, a scalability analysis of the algorithm has been conducted leading to the conclusion that the proposed algorithm is highly suitable for big data environments.}, keywords = {big data, energy, time series}, pubstate = {published}, tppubtype = {article} } A new approach for big data forecasting based on the k-weighted nearest neighbours algorithm is introduced in this work. Such an algorithm has been developed for distributed computing under the Apache Spark framework. 
Every phase of the algorithm is explained in this work, along with how the optimal values of the input parameters required for the algorithm are obtained. In order to test the developed algorithm, a Spanish energy consumption big data time series has been used. The accuracy of the prediction has been assessed showing remarkable results. Additionally, the optimal configuration of a Spark cluster has been discussed. Finally, a scalability analysis of the algorithm has been conducted leading to the conclusion that the proposed algorithm is highly suitable for big data environments. |
D. Gutiérrez-Avilés and J. A. Fábregas and J. Tejedor and F. Martínez-Álvarez and A. Troncoso and J. C. Riquelme SmartFD: A real big data application for electrical fraud detection (Conference) HAIS 13th International Conference on Hybrid Artificial Intelligence Systems, Lecture Notes in Computer Science 2018. (Links | BibTeX | Tags: big data, energy) @conference{HAIS2018, title = {SmartFD: A real big data application for electrical fraud detection}, author = {D. Gutiérrez-Avilés and J. A. Fábregas and J. Tejedor and F. Martínez-Álvarez and A. Troncoso and J. C. Riquelme}, url = {https://link.springer.com/chapter/10.1007/978-3-319-92639-1_11}, year = {2018}, date = {2018-01-01}, booktitle = {HAIS 13th International Conference on Hybrid Artificial Intelligence Systems}, series = {Lecture Notes in Computer Science}, keywords = {big data, energy}, pubstate = {published}, tppubtype = {conference} } |
G. Asencio-Cortes and A. Morales-Esteban and X. Shang and F. Martinez-Alvarez Earthquake prediction in California using regression algorithms and cloud-based big data infrastructure (Journal Article) Computers and Geosciences, (115), pp. 198-210, 2018, ISSN: 0098-3004. (Abstract | Links | BibTeX | Tags: big data, natural disasters, time series) @article{Asencio-Cortes2018b, title = {Earthquake prediction in California using regression algorithms and cloud-based big data infrastructure}, author = {G. Asencio-Cortes and A. Morales-Esteban and X. Shang and F. Martinez-Alvarez}, doi = {10.1016/j.cageo.2017.10.011}, issn = {0098-3004}, year = {2018}, date = {2018-01-01}, journal = {Computers and Geosciences}, number = {115}, pages = {198-210}, abstract = {Earthquake magnitude prediction is a challenging problem that has been widely studied during the last decades. Statistical, geophysical and machine learning approaches can be found in literature, with no particularly satisfactory results. In recent years, powerful computational techniques to analyze big data have emerged, making possible the analysis of massive datasets. These new methods make use of physical resources like cloud based architectures. California is known for being one of the regions with highest seismic activity in the world and many data are available. In this work, the use of several regression algorithms combined with ensemble learning is explored in the context of big data (1 GB catalog is used), in order to predict earthquake magnitude within the next seven days. Apache Spark framework, H2O library in R language and Amazon cloud infrastructure were used, reporting very promising results.}, keywords = {big data, natural disasters, time series}, pubstate = {published}, tppubtype = {article} } Earthquake magnitude prediction is a challenging problem that has been widely studied during the last decades. 
Statistical, geophysical and machine learning approaches can be found in literature, with no particularly satisfactory results. In recent years, powerful computational techniques to analyze big data have emerged, making possible the analysis of massive datasets. These new methods make use of physical resources like cloud based architectures. California is known for being one of the regions with highest seismic activity in the world and many data are available. In this work, the use of several regression algorithms combined with ensemble learning is explored in the context of big data (1 GB catalog is used), in order to predict earthquake magnitude within the next seven days. Apache Spark framework, H2O library in R language and Amazon cloud infrastructure were used, reporting very promising results. |
F. Martínez-Álvarez and A. Troncoso and J. C. Riquelme Data Science and Big Data in Energy Forecasting (Journal Article) Energies, 11 (11), pp. 3224, 2018. (Links | BibTeX | Tags: big data, energy) @article{Martinez18, title = {Data Science and Big Data in Energy Forecasting}, author = {F. Martínez-Álvarez and A. Troncoso and J. C. Riquelme}, doi = {10.3390/en11113224}, year = {2018}, date = {2018-01-01}, journal = {Energies}, volume = {11}, number = {11}, pages = {3224}, keywords = {big data, energy}, pubstate = {published}, tppubtype = {article} } |
2017 |
A. Galicia and J. F. Torres and F. Martínez-Álvarez and A. Troncoso Scalable Forecasting Techniques Applied to Big Electricity Time Series (Conference) IWANN International Work-Conference on Artificial Neural Networks, Lecture Notes in Computer Science 2017. (Links | BibTeX | Tags: big data, energy, time series) @conference{IWANN2017, title = {Scalable Forecasting Techniques Applied to Big Electricity Time Series}, author = {A. Galicia and J. F. Torres and F. Martínez-Álvarez and A. Troncoso}, url = {https://link.springer.com/chapter/10.1007/978-3-319-59147-6_15}, year = {2017}, date = {2017-01-01}, booktitle = {IWANN International Work-Conference on Artificial Neural Networks}, series = {Lecture Notes in Computer Science}, keywords = {big data, energy, time series}, pubstate = {published}, tppubtype = {conference} } |
O. Luaces and J. Díez and A. Alonso-Betanzos and A. Troncoso and A. Bahamonde Content-based methods in peer assessment of open-response questions to grade students as authors and as graders (Journal Article) Knowledge-Based Systems, 117 , pp. 79-87, 2017. (Abstract | Links | BibTeX | Tags: big data) @article{KNOSYS2017, title = {Content-based methods in peer assessment of open-response questions to grade students as authors and as graders}, author = {O. Luaces and J. Díez and A. Alonso-Betanzos and A. Troncoso and A. Bahamonde}, url = {http://www.sciencedirect.com/science/article/pii/S0950705116301964}, doi = {10.1016/j.knosys.2016.06.024}, year = {2017}, date = {2017-01-01}, journal = {Knowledge-Based Systems}, volume = {117}, pages = {79-87}, abstract = {Massive Open Online Courses (MOOCs) use different types of assignments in order to evaluate student knowledge. Multiple-choice tests are particularly apt given the possibility for automatic assessment of large numbers of assignments. However, certain skills require open responses that cannot be assessed automatically yet their evaluation by instructors or teaching assistants is unfeasible given the large number of students. A potentially effective solution is peer assessment whereby students grade the answers of other students. However, to avoid bias due to inexperience, such grades must be filtered. We describe a factorization approach to grading, as a scalable method capable of dealing with very high volumes of data. Our method is also capable of representing open-response content using a vector space model of the answers. Since reliable peer assessment requires students to make coherent assessments, students can be motivated by their assessments reflecting not only their own answers but also their efforts as graders. The method described is able to tackle both these aspects simultaneously. 
Finally, for a real-world university setting in Spain, we compared grades obtained by our method and grades awarded by university instructors, with results indicating a notable improvement from using a content-based approach. There was no evidence that instructor grading would have led to more accurate grading outcomes than the assessment produced by our models.}, keywords = {big data}, pubstate = {published}, tppubtype = {article} } Massive Open Online Courses (MOOCs) use different types of assignments in order to evaluate student knowledge. Multiple-choice tests are particularly apt given the possibility for automatic assessment of large numbers of assignments. However, certain skills require open responses that cannot be assessed automatically yet their evaluation by instructors or teaching assistants is unfeasible given the large number of students. A potentially effective solution is peer assessment whereby students grade the answers of other students. However, to avoid bias due to inexperience, such grades must be filtered. We describe a factorization approach to grading, as a scalable method capable of dealing with very high volumes of data. Our method is also capable of representing open-response content using a vector space model of the answers. Since reliable peer assessment requires students to make coherent assessments, students can be motivated by their assessments reflecting not only their own answers but also their efforts as graders. The method described is able to tackle both these aspects simultaneously. Finally, for a real-world university setting in Spain, we compared grades obtained by our method and grades awarded by university instructors, with results indicating a notable improvement from using a content-based approach. There was no evidence that instructor grading would have led to more accurate grading outcomes than the assessment produced by our models. |
2016 |
R. Talavera-Llames and R. Pérez-Chacón and M. Martínez-Ballesteros and A. Troncoso and F. Martínez-Álvarez A Nearest Neighbours - Based Algorithm for Big Time Series Data Forecasting (Conference) HAIS 11th International Conference on Hybrid Artificial Intelligence Systems, Lecture Notes in Computer Science 2016. (Links | BibTeX | Tags: big data, energy, time series) @conference{HAIS2016b, title = {A Nearest Neighbours - Based Algorithm for Big Time Series Data Forecasting}, author = {R. Talavera-Llames and R. Pérez-Chacón and M. Martínez-Ballesteros and A. Troncoso and F. Martínez-Álvarez}, url = {https://link.springer.com/chapter/10.1007/978-3-319-32034-2_15}, year = {2016}, date = {2016-01-01}, booktitle = {HAIS 11th International Conference on Hybrid Artificial Intelligence Systems}, series = {Lecture Notes in Computer Science}, keywords = {big data, energy, time series}, pubstate = {published}, tppubtype = {conference} } |
R. Pérez-Chacón and R. Talavera-Llames and F. Martínez-Álvarez and A. Troncoso Finding Electric Energy Consumption Patterns in Big Time Series Data (Conference) DCAI 13th International Conference on Distributed Computing and Artificial Intelligence, Advances in Intelligent Systems and Computing 2016. (Links | BibTeX | Tags: big data, energy, time series) @conference{DCAI2016, title = {Finding Electric Energy Consumption Patterns in Big Time Series Data}, author = {R. Pérez-Chacón and R. Talavera-Llames and F. Martínez-Álvarez and A. Troncoso}, url = {https://link.springer.com/chapter/10.1007%2F978-3-319-40162-1_25}, year = {2016}, date = {2016-01-01}, booktitle = {DCAI 13th International Conference on Distributed Computing and Artificial Intelligence}, series = {Advances in Intelligent Systems and Computing}, keywords = {big data, energy, time series}, pubstate = {published}, tppubtype = {conference} } |
A. M. Fernández and J. F. Torres and A. Troncoso and F. Martínez-Álvarez Automated Spark clusters deployment for Big Data with standalone applications integration (Conference) CAEPIA Multiconferencia de la Asociación Española para la Inteligencia Artificial (TAMIDA VII Simposio de Teoría y Aplicaciones de Minería de Datos), Lecture Notes in Computer Science 2016. (Links | BibTeX | Tags: big data) @conference{TAMIDA2016, title = {Automated Spark clusters deployment for Big Data with standalone applications integration}, author = {A. M. Fernández and J. F. Torres and A. Troncoso and F. Martínez-Álvarez}, url = {https://link.springer.com/chapter/10.1007/978-3-319-44636-3_14}, year = {2016}, date = {2016-01-01}, booktitle = {CAEPIA Multiconferencia de la Asociación Española para la Inteligencia Artificial (TAMIDA VII Simposio de Teoría y Aplicaciones de Minería de Datos)}, series = {Lecture Notes in Computer Science}, keywords = {big data}, pubstate = {published}, tppubtype = {conference} } |
2015 |
J. Díez and O. Luaces and A. Alonso-Betanzos and A. Troncoso and A. Bahamonde Calificación de calificadores en la evaluación por pares de exámenes de respuesta abierta (Workshop) CAEPIA Multiconferencia de la Asociación Española para la Inteligencia Artificial (TAMIDA VII Simposio de Teoría y Aplicaciones de Minería de Datos), 2015. @workshop{TAMIDA2015, title = {Calificación de calificadores en la evaluación por pares de exámenes de respuesta abierta}, author = {J. Díez and O. Luaces and A. Alonso-Betanzos and A. Troncoso and A. Bahamonde}, year = {2015}, date = {2015-01-01}, booktitle = {CAEPIA Multiconferencia de la Asociación Española para la Inteligencia Artificial (TAMIDA VII Simposio de Teoría y Aplicaciones de Minería de Datos)}, keywords = {big data}, pubstate = {published}, tppubtype = {workshop} } |
O. Luaces and J. Díez and A. Alonso-Betanzos and A. Troncoso and A. Bahamonde A factorization approach to evaluate open-response assignments in MOOCs using preference learning on peer assessments (Journal Article) Knowledge-Based Systems, 85 , pp. 322-328, 2015. (Abstract | Links | BibTeX | Tags: big data) @article{KNOSYS2015, title = {A factorization approach to evaluate open-response assignments in MOOCs using preference learning on peer assessments}, author = {O. Luaces and J. Díez and A. Alonso-Betanzos and A. Troncoso and A. Bahamonde}, url = {https://www.sciencedirect.com/science/article/abs/pii/S0950705115002051}, doi = {10.1016/j.knosys.2015.05.019}, year = {2015}, date = {2015-00-00}, journal = {Knowledge-Based Systems}, volume = {85}, pages = {322-328}, abstract = {Evaluating open-response assignments in Massive Open Online Courses is a difficult task because of the huge number of students involved. Peer grading is an effective method to address this problem. There are two basic approaches in the literature: cardinal and ordinal. The first case uses grades assigned by student-graders to a set of assignments of other colleagues. In the ordinal approach, the raw materials used by grading systems are the relative orders that graders appreciate in the assignments that they evaluate. In this paper we present a factorization method that seeks a trade-off between cardinal and ordinal approaches. The algorithm learns from preference judgments to avoid the subjectivity of the numeric grades. But in addition to preferences expressed by student-graders, we include other preferences: those induced from assignments with significantly different average grades. The paper includes a report of the results obtained using this approach in a real world dataset collected in 3 Universities of Spain, A Coruña, Pablo de Olavide at Sevilla, and Oviedo at Gijón. Additionally, we studied the sensitivity of the method with respect to the number of assignments graded by each student. 
Our method achieves similar or better scores than staff instructors when we measure the discrepancies with other instructors’ grades.}, keywords = {big data}, pubstate = {published}, tppubtype = {article} } Evaluating open-response assignments in Massive Open Online Courses is a difficult task because of the huge number of students involved. Peer grading is an effective method to address this problem. There are two basic approaches in the literature: cardinal and ordinal. The first case uses grades assigned by student-graders to a set of assignments of other colleagues. In the ordinal approach, the raw materials used by grading systems are the relative orders that graders appreciate in the assignments that they evaluate. In this paper we present a factorization method that seeks a trade-off between cardinal and ordinal approaches. The algorithm learns from preference judgments to avoid the subjectivity of the numeric grades. But in addition to preferences expressed by student-graders, we include other preferences: those induced from assignments with significantly different average grades. The paper includes a report of the results obtained using this approach in a real world dataset collected in 3 Universities of Spain, A Coruña, Pablo de Olavide at Sevilla, and Oviedo at Gijón. Additionally, we studied the sensitivity of the method with respect to the number of assignments graded by each student. Our method achieves similar or better scores than staff instructors when we measure the discrepancies with other instructors’ grades. |
2013 |
J. Díez and O. Luaces and A. Alonso-Betanzos and A. Troncoso and A. Bahamonde Peer assessment in MOOCs using preference learning via matrix factorization (Workshop) NIPS Neural Information Processing Systems Foundation (Workshop on Data Driven Education), 2013. @workshop{NIPS2013, title = {Peer assessment in MOOCs using preference learning via matrix factorization}, author = {J. Díez and O. Luaces and A. Alonso-Betanzos and A. Troncoso and A. Bahamonde}, year = {2013}, date = {2013-01-01}, booktitle = {NIPS Neural Information Processing Systems Foundation (Workshop on Data Driven Education)}, keywords = {big data}, pubstate = {published}, tppubtype = {workshop} } |