{"id":2159459,"identifier":"ARP/VWQFD2","persistentUrl":"https://hdl.handle.net/21.15109/ARP/VWQFD2","protocol":"hdl","authority":"21.15109","separator":"/","publisher":"ARP","publicationDate":"2026-05-08","storageIdentifier":"s3-sztaki://21.15109/ARP/VWQFD2","metadataLanguage":"hu","datasetType":"dataset","datasetVersion":{"id":81045,"datasetId":2159459,"datasetPersistentId":"hdl:21.15109/ARP/VWQFD2","datasetType":"dataset","storageIdentifier":"s3-sztaki://21.15109/ARP/VWQFD2","versionNumber":2,"internalVersionNumber":6,"versionMinorNumber":0,"versionState":"RELEASED","latestVersionPublishingState":"RELEASED","productionDate":"2025-11-15","lastUpdateTime":"2026-05-12T12:55:37Z","releaseTime":"2026-05-12T12:55:37Z","createTime":"2026-05-12T12:45:42Z","publicationDate":"2026-05-08","citationDate":"2026-05-08","license":{"name":"CC BY-NC-ND 4.0","uri":"http://creativecommons.org/licenses/by-nc-nd/4.0","iconUri":"https://licensebuttons.net/l/by-nc-nd/4.0/88x31.png"},"fileAccessRequest":true,"metadataBlocks":{"citation":{"displayName":"Citation Metadata","name":"citation","fields":[{"typeName":"title","multiple":false,"typeClass":"primitive","value":"Subject categorisation experiments with AI in MTMT"},{"typeName":"author","multiple":true,"typeClass":"compound","value":[{"authorName":{"typeName":"authorName","multiple":false,"typeClass":"primitive","value":"Micsik, András"},"authorAffiliation":{"typeName":"authorAffiliation","multiple":false,"typeClass":"primitive","value":"HUN-REN SZTAKI"},"authorIdentifierScheme":{"typeName":"authorIdentifierScheme","multiple":false,"typeClass":"controlledVocabulary","value":"ORCID"},"authorIdentifier":{"typeName":"authorIdentifier","multiple":false,"typeClass":"primitive","value":"0000-0001-9859-9186"}},{"authorName":{"typeName":"authorName","multiple":false,"typeClass":"primitive","value":"Tanácsi, Roland"},"authorAffiliation":{"typeName":"authorAffiliation","multiple":false,"typeClass":"primitive","value":"HUN-REN SZTAKI"}}]},{"typeName":"datasetContact","multiple":true,"typeClass":"compound","value":[{"datasetContactName":{"typeName":"datasetContactName","multiple":false,"typeClass":"primitive","value":"Micsik, András"},"datasetContactAffiliation":{"typeName":"datasetContactAffiliation","multiple":false,"typeClass":"primitive","value":"HUN-REN SZTAKI"},"datasetContactEmail":{"typeName":"datasetContactEmail","multiple":false,"typeClass":"primitive","value":"micsik@sztaki.hu"}}]},{"typeName":"dsDescription","multiple":true,"typeClass":"compound","value":[{"dsDescriptionValue":{"typeName":"dsDescriptionValue","multiple":false,"typeClass":"primitive","value":"Code, sample data and results for subject categorisation experiments with AI in MTMT"}}]},{"typeName":"subject","multiple":true,"typeClass":"controlledVocabulary","value":["Computer and Information Science"]},{"typeName":"keyword","multiple":true,"typeClass":"compound","value":[{"keywordValue":{"typeName":"keywordValue","multiple":false,"typeClass":"primitive","value":"subject classification"}},{"keywordValue":{"typeName":"keywordValue","multiple":false,"typeClass":"primitive","value":"scientific categorization"}},{"keywordValue":{"typeName":"keywordValue","multiple":false,"typeClass":"primitive","value":"transformer models"}},{"keywordValue":{"typeName":"keywordValue","multiple":false,"typeClass":"primitive","value":"Support Vector Classifier"}},{"keywordValue":{"typeName":"keywordValue","multiple":false,"typeClass":"primitive","value":"data cleaning"}},{"keywordValue":{"typeName":"keywordValue","multiple":false,"typeClass":"primitive","value":"large language models"}}]},{"typeName":"topicClassification","multiple":true,"typeClass":"compound","value":[{"topicClassValue":{"typeName":"topicClassValue","multiple":false,"typeClass":"primitive","value":"artificial intelligence"},"topicClassVocab":{"typeName":"topicClassVocab","multiple":false,"typeClass":"primitive","value":"EuroSciVoc"},"topicClassVocabURI":{"typeName":"topicClassVocabURI","multiple":false,"typeClass":"primitive","value":"http://data.europa.eu/8mn/euroscivoc/4c8f4b46-6f5c-41d9-9079-7de85c16431d"}}]},{"typeName":"publication","multiple":true,"typeClass":"compound","value":[{"publicationCitation":{"typeName":"publicationCitation","multiple":false,"typeClass":"primitive","value":"Tanácsi, R., & Micsik, A. (2026). A Comparative Evaluation of AI Approaches to Large-Scale Scientific Subject Classification. Big Data and Cognitive Computing, 10(5), 151."},"publicationIDType":{"typeName":"publicationIDType","multiple":false,"typeClass":"controlledVocabulary","value":"doi"},"publicationIDNumber":{"typeName":"publicationIDNumber","multiple":false,"typeClass":"primitive","value":"10.3390/bdcc10050151"},"publicationURL":{"typeName":"publicationURL","multiple":false,"typeClass":"primitive","value":"https://doi.org/10.3390/bdcc10050151"},"publicationRelationType":{"typeName":"publicationRelationType","multiple":false,"typeClass":"controlledVocabulary","value":"IsSupplementTo"}}]},{"typeName":"language","multiple":true,"typeClass":"controlledVocabulary","value":["English"]},{"typeName":"productionDate","multiple":false,"typeClass":"primitive","value":"2025-11-15"},{"typeName":"grantNumber","multiple":true,"typeClass":"compound","value":[{"grantNumberAgency":{"typeName":"grantNumberAgency","multiple":false,"typeClass":"primitive","value":"NKFIH"},"grantNumberValue":{"typeName":"grantNumberValue","multiple":false,"typeClass":"primitive","value":"RRF-2.3.1-21-2022-00004"}}]},{"typeName":"depositor","multiple":false,"typeClass":"primitive","value":"Micsik, András"},{"typeName":"dateOfDeposit","multiple":false,"typeClass":"primitive","value":"2026-02-03"},{"typeName":"software","multiple":true,"typeClass":"compound","value":[{"softwareName":{"typeName":"softwareName","multiple":false,"typeClass":"primitive","value":"Python"},"softwareVersion":{"typeName":"softwareVersion","multiple":false,"typeClass":"primitive","value":"3.10"}}]}]}},"files":[{"label":"README.txt","restricted":false,"version":1,"datasetVersionId":81045,"dataFile":{"id":2163827,"persistentId":"hdl:21.15109/ARP/VWQFD2/OLMFQR","pidURL":"https://hdl.handle.net/21.15109/ARP/VWQFD2/OLMFQR","filename":"README.txt","contentType":"text/plain","friendlyType":"Plain Text","filesize":317,"storageIdentifier":"s3-sztaki://concorda:19def14ecf6-9b96af192434","rootDataFileId":-1,"md5":"5a9d1ab8e4fa78ff465f69453cf772e0","checksum":{"type":"MD5","value":"5a9d1ab8e4fa78ff465f69453cf772e0"},"tabularData":false,"creationDate":"2026-05-03","publicationDate":"2026-05-08","lastUpdateTime":"2026-05-12T12:55:37Z","fileAccessRequest":true}},{"label":"annif.csv","restricted":false,"directoryLabel":"results","version":1,"datasetVersionId":81045,"dataFile":{"id":2163831,"persistentId":"hdl:21.15109/ARP/VWQFD2/NF3M7P","pidURL":"https://hdl.handle.net/21.15109/ARP/VWQFD2/NF3M7P","filename":"annif.csv","contentType":"text/csv","friendlyType":"Comma Separated Values","filesize":2267,"storageIdentifier":"s3-sztaki://concorda:19def14ed4c-22b64ccf0d86","rootDataFileId":-1,"md5":"8e3a64ace616bcb168b6e4f7fb2f282e","checksum":{"type":"MD5","value":"8e3a64ace616bcb168b6e4f7fb2f282e"},"tabularData":false,"creationDate":"2026-05-03","publicationDate":"2026-05-08","directoryLabel":"results","lastUpdateTime":"2026-05-12T12:55:37Z","fileAccessRequest":true}},{"label":"embedding_scikit.csv","restricted":false,"directoryLabel":"results","version":1,"datasetVersionId":81045,"dataFile":{"id":2163822,"persistentId":"hdl:21.15109/ARP/VWQFD2/TUU2A3","pidURL":"https://hdl.handle.net/21.15109/ARP/VWQFD2/TUU2A3","filename":"embedding_scikit.csv","contentType":"text/csv","friendlyType":"Comma Separated Values","filesize":4075,"storageIdentifier":"s3-sztaki://concorda:19def14ed76-96578ee77634","rootDataFileId":-1,"md5":"b4c198539305ae175ad8a4fced2a604a","checksum":{"type":"MD5","value":"b4c198539305ae175ad8a4fced2a604a"},"tabularData":false,"creationDate":"2026-05-03","publicationDate":"2026-05-08","directoryLabel":"results","lastUpdateTime":"2026-05-12T12:55:37Z","fileAccessRequest":true}},{"label":"eval_svm_rbf.py","restricted":false,"directoryLabel":"src","version":1,"datasetVersionId":81045,"dataFile":{"id":2163820,"persistentId":"hdl:21.15109/ARP/VWQFD2/GQ2QTF","pidURL":"https://hdl.handle.net/21.15109/ARP/VWQFD2/GQ2QTF","filename":"eval_svm_rbf.py","contentType":"text/x-python","friendlyType":"Python Source Code","filesize":6446,"storageIdentifier":"s3-sztaki://concorda:19def14ef6e-22e5dcc2e33e","rootDataFileId":-1,"md5":"6aadfcc714f0bba46bae9b7bb47fcb9c","checksum":{"type":"MD5","value":"6aadfcc714f0bba46bae9b7bb47fcb9c"},"tabularData":false,"creationDate":"2026-05-03","publicationDate":"2026-05-08","directoryLabel":"src","lastUpdateTime":"2026-05-12T12:55:37Z","fileAccessRequest":true}},{"label":"frascati_mapping.json","restricted":false,"directoryLabel":"sample_data","version":1,"datasetVersionId":81045,"dataFile":{"id":2163821,"persistentId":"hdl:21.15109/ARP/VWQFD2/UMCK7O","pidURL":"https://hdl.handle.net/21.15109/ARP/VWQFD2/UMCK7O","filename":"frascati_mapping.json","contentType":"application/json","friendlyType":"JSON","filesize":2394,"storageIdentifier":"s3-sztaki://concorda:19def14ee6a-e94d2c6b0e0e","rootDataFileId":-1,"md5":"8a2a440fd99bf426350a6ed7a327dbfe","checksum":{"type":"MD5","value":"8a2a440fd99bf426350a6ed7a327dbfe"},"tabularData":false,"creationDate":"2026-05-03","publicationDate":"2026-05-08","directoryLabel":"sample_data","lastUpdateTime":"2026-05-12T12:55:37Z","fileAccessRequest":true}},{"label":"lvl4-mtmt-large-multiclass-svm-rbf.zip","restricted":false,"directoryLabel":"model","version":1,"datasetVersionId":81045,"dataFile":{"id":2163836,"persistentId":"hdl:21.15109/ARP/VWQFD2/RSDNVR","pidURL":"https://hdl.handle.net/21.15109/ARP/VWQFD2/RSDNVR","filename":"lvl4-mtmt-large-multiclass-svm-rbf.zip","contentType":"application/octet-stream","friendlyType":"Unknown","filesize":2015902080,"storageIdentifier":"s3-sztaki://concorda:19def6c795d-3d81f8a341ec","rootDataFileId":-1,"md5":"9e6b343b55a52c2c09c489aeaacc5af4","checksum":{"type":"MD5","value":"9e6b343b55a52c2c09c489aeaacc5af4"},"tabularData":false,"creationDate":"2026-05-03","publicationDate":"2026-05-08","directoryLabel":"model","lastUpdateTime":"2026-05-12T12:55:37Z","fileAccessRequest":true}},{"label":"requirements.txt","restricted":false,"directoryLabel":"src","version":1,"datasetVersionId":81045,"dataFile":{"id":2163914,"persistentId":"hdl:21.15109/ARP/VWQFD2/YSHVT5","pidURL":"https://hdl.handle.net/21.15109/ARP/VWQFD2/YSHVT5","filename":"requirements.txt","contentType":"text/plain","friendlyType":"Plain Text","filesize":182,"storageIdentifier":"s3-sztaki://concorda:19e1c4141ac-65a315e6ccc3","rootDataFileId":-1,"md5":"ace08c9b245683be998b0fdc19ac8581","checksum":{"type":"MD5","value":"ace08c9b245683be998b0fdc19ac8581"},"tabularData":false,"creationDate":"2026-05-12","publicationDate":"2026-05-12","directoryLabel":"src","lastUpdateTime":"2026-05-12T12:55:37Z","fileAccessRequest":true}},{"label":"sample_evaluation_data.csv","restricted":false,"directoryLabel":"sample_data","version":1,"datasetVersionId":81045,"dataFile":{"id":2163835,"persistentId":"hdl:21.15109/ARP/VWQFD2/WF4GHD","pidURL":"https://hdl.handle.net/21.15109/ARP/VWQFD2/WF4GHD","filename":"sample_evaluation_data.csv","contentType":"text/csv","friendlyType":"Comma Separated Values","filesize":25015,"storageIdentifier":"s3-sztaki://concorda:19def14ee85-16d4ebc37513","rootDataFileId":-1,"md5":"97a5cb0fdf30662e8b286e446fc71590","checksum":{"type":"MD5","value":"97a5cb0fdf30662e8b286e446fc71590"},"tabularData":false,"creationDate":"2026-05-03","publicationDate":"2026-05-08","directoryLabel":"sample_data","lastUpdateTime":"2026-05-12T12:55:37Z","fileAccessRequest":true}},{"label":"sample_evaluation_data.npy","restricted":false,"directoryLabel":"sample_data","version":1,"datasetVersionId":81045,"dataFile":{"id":2163834,"persistentId":"hdl:21.15109/ARP/VWQFD2/2NLZ4Q","pidURL":"https://hdl.handle.net/21.15109/ARP/VWQFD2/2NLZ4Q","filename":"sample_evaluation_data.npy","contentType":"application/octet-stream","friendlyType":"Unknown","filesize":1818752,"storageIdentifier":"s3-sztaki://concorda:19def14ee95-0ec4f09612a5","rootDataFileId":-1,"md5":"afa079c4bd87d0723929dedb2f148443","checksum":{"type":"MD5","value":"afa079c4bd87d0723929dedb2f148443"},"tabularData":false,"creationDate":"2026-05-03","publicationDate":"2026-05-08","directoryLabel":"sample_data","lastUpdateTime":"2026-05-12T12:55:37Z","fileAccessRequest":true}},{"label":"sample_training_data.csv","restricted":false,"directoryLabel":"sample_data","version":1,"datasetVersionId":81045,"dataFile":{"id":2163823,"persistentId":"hdl:21.15109/ARP/VWQFD2/SKUVFP","pidURL":"https://hdl.handle.net/21.15109/ARP/VWQFD2/SKUVFP","filename":"sample_training_data.csv","contentType":"text/csv","friendlyType":"Comma Separated Values","filesize":59377,"storageIdentifier":"s3-sztaki://concorda:19def14eee4-2f6441cd19fe","rootDataFileId":-1,"md5":"48cdb73aeb915b85ea0ef38140354417","checksum":{"type":"MD5","value":"48cdb73aeb915b85ea0ef38140354417"},"tabularData":false,"creationDate":"2026-05-03","publicationDate":"2026-05-08","directoryLabel":"sample_data","lastUpdateTime":"2026-05-12T12:55:37Z","fileAccessRequest":true}},{"label":"sample_training_data.npy","restricted":false,"directoryLabel":"sample_data","version":1,"datasetVersionId":81045,"dataFile":{"id":2163833,"persistentId":"hdl:21.15109/ARP/VWQFD2/2Q93LH","pidURL":"https://hdl.handle.net/21.15109/ARP/VWQFD2/2Q93LH","filename":"sample_training_data.npy","contentType":"application/octet-stream","friendlyType":"Unknown","filesize":4243584,"storageIdentifier":"s3-sztaki://concorda:19def14ef09-d8cba9909b25","rootDataFileId":-1,"md5":"e01b3016acc80a112d2903ddf41d60d3","checksum":{"type":"MD5","value":"e01b3016acc80a112d2903ddf41d60d3"},"tabularData":false,"creationDate":"2026-05-03","publicationDate":"2026-05-08","directoryLabel":"sample_data","lastUpdateTime":"2026-05-12T12:55:37Z","fileAccessRequest":true}},{"label":"scibert_lvl3.csv","restricted":false,"directoryLabel":"results","version":1,"datasetVersionId":81045,"dataFile":{"id":2163826,"persistentId":"hdl:21.15109/ARP/VWQFD2/W4KTNW","pidURL":"https://hdl.handle.net/21.15109/ARP/VWQFD2/W4KTNW","filename":"scibert_lvl3.csv","contentType":"text/csv","friendlyType":"Comma Separated Values","filesize":3258,"storageIdentifier":"s3-sztaki://concorda:19def14ed99-6698ca45fc7a","rootDataFileId":-1,"md5":"1c9d9644d1de08e7f1fb85f0333f946b","checksum":{"type":"MD5","value":"1c9d9644d1de08e7f1fb85f0333f946b"},"tabularData":false,"creationDate":"2026-05-03","publicationDate":"2026-05-08","directoryLabel":"results","lastUpdateTime":"2026-05-12T12:55:37Z","fileAccessRequest":true}},{"label":"scibert_lvl4.csv","restricted":false,"directoryLabel":"results","version":1,"datasetVersionId":81045,"dataFile":{"id":2163830,"persistentId":"hdl:21.15109/ARP/VWQFD2/AKX2F6","pidURL":"https://hdl.handle.net/21.15109/ARP/VWQFD2/AKX2F6","filename":"scibert_lvl4.csv","contentType":"text/csv","friendlyType":"Comma Separated Values","filesize":3288,"storageIdentifier":"s3-sztaki://concorda:19def14edba-b7fe2b1bb578","rootDataFileId":-1,"md5":"9aeb860bf739ed09e094dbb9c7dc72e3","checksum":{"type":"MD5","value":"9aeb860bf739ed09e094dbb9c7dc72e3"},"tabularData":false,"creationDate":"2026-05-03","publicationDate":"2026-05-08","directoryLabel":"results","lastUpdateTime":"2026-05-12T12:55:37Z","fileAccessRequest":true}},{"label":"scibert_lvl4_subtopics.csv","restricted":false,"directoryLabel":"results","version":1,"datasetVersionId":81045,"dataFile":{"id":2163824,"persistentId":"hdl:21.15109/ARP/VWQFD2/S6CBTZ","pidURL":"https://hdl.handle.net/21.15109/ARP/VWQFD2/S6CBTZ","filename":"scibert_lvl4_subtopics.csv","contentType":"text/csv","friendlyType":"Comma Separated Values","filesize":12681,"storageIdentifier":"s3-sztaki://concorda:19def14ee23-7a60dc9dc466","rootDataFileId":-1,"md5":"11cea9e8926e8bfc8be57226786c441a","checksum":{"type":"MD5","value":"11cea9e8926e8bfc8be57226786c441a"},"tabularData":false,"creationDate":"2026-05-03","publicationDate":"2026-05-08","directoryLabel":"results","lastUpdateTime":"2026-05-12T12:55:37Z","fileAccessRequest":true}},{"label":"scibert_moe.csv","restricted":false,"directoryLabel":"results","version":1,"datasetVersionId":81045,"dataFile":{"id":2163828,"persistentId":"hdl:21.15109/ARP/VWQFD2/WMCKQO","pidURL":"https://hdl.handle.net/21.15109/ARP/VWQFD2/WMCKQO","filename":"scibert_moe.csv","contentType":"text/csv","friendlyType":"Comma Separated Values","filesize":15114,"storageIdentifier":"s3-sztaki://concorda:19def14ee47-b3b3d74dc65d","rootDataFileId":-1,"md5":"2d789be516ce3c0cca8c9505a17dcd1c","checksum":{"type":"MD5","value":"2d789be516ce3c0cca8c9505a17dcd1c"},"tabularData":false,"creationDate":"2026-05-03","publicationDate":"2026-05-08","directoryLabel":"results","lastUpdateTime":"2026-05-12T12:55:37Z","fileAccessRequest":true}},{"label":"svm_rbf_confusion_matrix_percent.csv","restricted":false,"directoryLabel":"model","version":1,"datasetVersionId":81045,"dataFile":{"id":2163829,"persistentId":"hdl:21.15109/ARP/VWQFD2/MV8MX6","pidURL":"https://hdl.handle.net/21.15109/ARP/VWQFD2/MV8MX6","filename":"svm_rbf_confusion_matrix_percent.csv","contentType":"text/csv","friendlyType":"Comma Separated Values","filesize":5867,"storageIdentifier":"s3-sztaki://concorda:19def14ed29-125582debc75","rootDataFileId":-1,"md5":"9836e85f21382168c3fa60d8bb5463eb","checksum":{"type":"MD5","value":"9836e85f21382168c3fa60d8bb5463eb"},"tabularData":false,"creationDate":"2026-05-03","publicationDate":"2026-05-08","directoryLabel":"model","lastUpdateTime":"2026-05-12T12:55:37Z","fileAccessRequest":true}},{"label":"train_svm_rbf.py","restricted":false,"directoryLabel":"src","version":1,"datasetVersionId":81045,"dataFile":{"id":2163832,"persistentId":"hdl:21.15109/ARP/VWQFD2/4BGUBA","pidURL":"https://hdl.handle.net/21.15109/ARP/VWQFD2/4BGUBA","filename":"train_svm_rbf.py","contentType":"text/x-python","friendlyType":"Python Source Code","filesize":4807,"storageIdentifier":"s3-sztaki://concorda:19def14efb8-a192a84d24f9","rootDataFileId":-1,"md5":"6cb2735f2e906219ed25a9e14ad21555","checksum":{"type":"MD5","value":"6cb2735f2e906219ed25a9e14ad21555"},"tabularData":false,"creationDate":"2026-05-03","publicationDate":"2026-05-08","directoryLabel":"src","lastUpdateTime":"2026-05-12T12:55:37Z","fileAccessRequest":true}}],"citation":"Micsik, András; Tanácsi, Roland, 2026, \"Subject categorisation experiments with AI in MTMT\", https://hdl.handle.net/21.15109/ARP/VWQFD2, ARP, V2"}}