author = "Pinheiro, Gabriel Augusto Lins Leal and da Silva, Juarez L. F. 
                         and Soares, Marinalva D. and Quiles, Marcos 
          affiliation = "{Instituto Nacional de Pesquisas Espaciais (INPE)} and 
                         {Universidade de S{\~a}o Paulo (USP)} and {Universidade Federal 
                         de S{\~a}o Paulo (UNIFESP)} and {Universidade Federal de S{\~a}o 
                         Paulo (UNIFESP)}",
                title = "A graph-based clustering analysis of the QM9 dataset via SMILES 
            booktitle = "Proceedings...",
                 year = "2020",
               editor = "Gervasi, O. and Murgante, B. and Misra, S. and Garau, C. and 
                         Blecic, I. and Taniar, D. and Apduhan, B. O. and Rocha, A. M. A. 
                         C. and Tarantino, E. and Torre, C. M. and Karaca, Y.",
                pages = "421--433",
         organization = "International Conference on Computational Science and Its 
                         Applications (ICCSA), 20.",
            publisher = "Springer",
                 note = "Lecture Notes in Computer Science, v.12249",
             keywords = "Clustering, Graph, Quantum-chemistry.",
             abstract = "Machine learning has become a new hot-topic in Materials Sciences. 
                         For instance, several approaches from unsupervised and supervised 
                         learning have been applied as surrogate models to study the 
                         properties of several classes of materials. Here, we investigate, 
                         from a graph-based clustering perspective, the Quantum QM9 dataset. 
                         This dataset is one of the most used datasets in this scenario. 
                         Our investigation is twofold: 1) understand whether the QM9 
                         samples are organized in clusters, and 2) if the clustering 
                         structure might provide us with some insights regarding anomalous 
                         molecules, or molecules that jeopardize the accuracy of supervised 
                         property prediction methods. Our results show that the QM9 is 
                         indeed structured into clusters. These clusters, for instance, 
                         might suggest better approaches for splitting the dataset when 
                         using cross-correlation approaches in supervised learning. 
                         However, regarding our second question, our findings indicate that 
                         the clustering structure, obtained via Simplified Molecular Input 
                         Line Entry System (SMILES) representation, cannot be used to 
                         filter anomalous samples in property prediction. Thus, further 
                         investigation regarding this limitation should be conducted in 
                         future research.",
  conference-location = "Cagliari, Italy",
      conference-year = "01-04 July",
                  doi = "10.1007/978-3-030-58799-4_74",
                  url = "http://dx.doi.org/10.1007/978-3-030-58799-4_74",
                 isbn = "978-303058798-7",
                 issn = "03029743",
             language = "en",
           targetfile = "pinheiro_graph.pdf",
        urlaccessdate = "28 nov. 2020"