{"id":507232,"identifier":"ARP/FBIIOZ","persistentUrl":"https://hdl.handle.net/21.15109/ARP/FBIIOZ","protocol":"hdl","authority":"21.15109","publisher":"ARP","publicationDate":"2024-12-06","storageIdentifier":"s3-sztaki://21.15109/ARP/FBIIOZ","metadataLanguage":"en","datasetVersion":{"id":2613,"datasetId":507232,"datasetPersistentId":"hdl:21.15109/ARP/FBIIOZ","storageIdentifier":"s3-sztaki://21.15109/ARP/FBIIOZ","versionNumber":1,"versionMinorNumber":1,"versionState":"RELEASED","lastUpdateTime":"2024-12-06T08:41:25Z","releaseTime":"2024-12-06T08:41:25Z","createTime":"2024-12-06T08:38:37Z","publicationDate":"2024-12-06","citationDate":"2024-12-06","license":{"name":"CC BY-NC 4.0","uri":"http://creativecommons.org/licenses/by-nc/4.0","iconUri":"https://licensebuttons.net/l/by-nc/4.0/88x31.png"},"fileAccessRequest":true,"metadataBlocks":{"citation":{"displayName":"Citation Metadata","name":"citation","fields":[{"typeName":"title","multiple":false,"typeClass":"primitive","value":"Anonymized netflow and security scan data"},{"typeName":"author","multiple":true,"typeClass":"compound","value":[{"authorName":{"typeName":"authorName","multiple":false,"typeClass":"primitive","value":"Rigó, Ernő"},"authorAffiliation":{"typeName":"authorAffiliation","multiple":false,"typeClass":"primitive","value":"SZTAKI staff"},"authorIdentifierScheme":{"typeName":"authorIdentifierScheme","multiple":false,"typeClass":"controlledVocabulary","value":"ORCID"},"authorIdentifier":{"typeName":"authorIdentifier","multiple":false,"typeClass":"primitive","value":"0000-0003-1044-7167"}}]},{"typeName":"datasetContact","multiple":true,"typeClass":"compound","value":[{"datasetContactName":{"typeName":"datasetContactName","multiple":false,"typeClass":"primitive","value":"Rigó, Ernő"},"datasetContactAffiliation":{"typeName":"datasetContactAffiliation","multiple":false,"typeClass":"primitive","value":"SZTAKI 
staff"},"datasetContactEmail":{"typeName":"datasetContactEmail","multiple":false,"typeClass":"primitive","value":"rigo.erno@sztaki.hu"}}]},{"typeName":"dsDescription","multiple":true,"typeClass":"compound","value":[{"dsDescriptionValue":{"typeName":"dsDescriptionValue","multiple":false,"typeClass":"primitive","value":"<b>Description</b><br>\n<br>\nThis dataset contains network traffic and vulnerability scan reports for networks with different characteristics:<br>\n<ul>\n<li>vlan11 is a public network with low traffic and ~30 hosts</li>\n<li>cloud is a public network with moderate traffic and ~100 hosts from a cloud environment</li>\n<li>vlan23 is a private network with high traffic and ~200 hosts</li>\n</ul>\n<br>\n<b>Data formats</b><br>\n<br>\n<ul>\n<li>netflow data is presented in (CSV, JSON, RAW) formats for a 30-day period</li>\n<li>security scan reports are presented in (CSV, filtered CSV, HTML, XML) formats</li>\n</ul>\n<br>\nData is compressed in many cases for preserving repository space and network bandwidth. 
Uncompress with <code>xz</code><br>\n<br>\n<b>Anonymization</b><br>\n<br>\nThe anonymized dataset comprises a collection of network traffic and domain-related information derived from the described environments.<br>\n<br>\nThe source information includes sensitive IPv4 addresses and domain hostnames, vital for network analysis, vulnerability assessments, and security research.<br>\nHowever, due to the sensitive nature of the data, anonymization is employed to protect personal and organizational privacy.<br>\n<br>\n<b>Anonymization Methodology</b><br>\n<br>\nTo ensure privacy while retaining the dataset's analytical value, the following anonymization techniques are applied:<br>\n<br>\nThe main objective is to maintain the utility of network patterns and relationships while masking specific addresses to prevent any form of trace-back to individual devices or networks.<br>\n<br>\n<b>IPv4 Address Anonymization</b><br>\n<br>\nEach IPv4 address in the dataset has its first two octets anonymized, using a consistent mapping system that replaces these octets with random, uniquely assigned numbers.<br>\nThis transformation is deterministic, meaning that the same original address segments always map to the same anonymized segments, thus preserving relationships and patterns critical for analysis.<br>\n<br>\n<b>Domain Name Anonymization</b><br>\n<br>\nThe hostnames within domain names are anonymized by substituting them with a randomly generated string.<br>\nThese new hostnames follow a structured anonymized format: &lt;randomname&gt;.random.xyz.<br>\n<br>\nSimilar to IP anonymization, the mapping is consistent across the dataset, ensuring that each original hostname is consistently replaced with the same anonymized version.<br>\n<br>\n<b>Privacy Considerations</b><br>\n<ul>\n<li>Consistency: The anonymization process employs a reproducible mapping system, ensuring that every occurrence of a unique IP address segment or domain hostname is anonymized identically across the 
dataset. This consistency allows for meaningful analysis of trends and repeated interactions without exposing raw data.</li>\n<li>Data Integrity: By focusing the anonymization on specific segments of IP addresses and hostnames, the overall structure of the data remains intact. This integrity is crucial for operations such as network flow analysis and anomaly detection, which rely on the continuity of data patterns.</li>\n<li>Data Minimization: Alongside anonymizing critical fields, the dataset also undergoes a process of column removal, where non-essential fields that might contain sensitive information are excluded. This further reduces the risk of unintended information exposure.</li>\n</ul>"},"dsDescriptionDate":{"typeName":"dsDescriptionDate","multiple":false,"typeClass":"primitive","value":"2023-06-01"}}]},{"typeName":"subject","multiple":true,"typeClass":"controlledVocabulary","value":["Computer and Information Science"]},{"typeName":"notesText","multiple":false,"typeClass":"primitive","value":"For additional information see README.md"},{"typeName":"depositor","multiple":false,"typeClass":"primitive","value":"Rigó, Ernő"},{"typeName":"dateOfDeposit","multiple":false,"typeClass":"primitive","value":"2024-12-06"}]},"geospatial":{"displayName":"Geospatial Metadata","name":"geospatial","fields":[]},"journal":{"displayName":"Journal Metadata","name":"journal","fields":[]}},"files":[{"description":"additional description of the datasets and anonymization methods","label":"README.md","restricted":false,"version":1,"datasetVersionId":2613,"dataFile":{"id":507240,"persistentId":"hdl:21.15109/ARP/FBIIOZ/SXLDNE","pidURL":"https://hdl.handle.net/21.15109/ARP/FBIIOZ/SXLDNE","filename":"README.md","contentType":"text/markdown","friendlyType":"Markdown Text","filesize":3201,"description":"additional description of the datasets and anonymization 
methods","storageIdentifier":"s3-sztaki://concorda:1939b1875fd-cb0ddca4259e","rootDataFileId":-1,"md5":"06cc03198a6b30df611d81c583369abc","checksum":{"type":"MD5","value":"06cc03198a6b30df611d81c583369abc"},"tabularData":false,"creationDate":"2024-12-06","publicationDate":"2024-12-06","fileAccessRequest":true}},{"description":"aggregated anonymized netflow data for cloud vlan","label":"netflow-aggr-cloud.csv.xz","restricted":false,"version":1,"datasetVersionId":2613,"dataFile":{"id":507238,"persistentId":"hdl:21.15109/ARP/FBIIOZ/CH1BH6","pidURL":"https://hdl.handle.net/21.15109/ARP/FBIIOZ/CH1BH6","filename":"netflow-aggr-cloud.csv.xz","contentType":"application/x-xz","friendlyType":"XZ Archive","filesize":559656,"description":"aggregated anonymized netflow data for cloud vlan","storageIdentifier":"s3-sztaki://concorda:1939b18744d-7733436bdcb8","rootDataFileId":-1,"md5":"30646322ecdd3a9f402ead40b1f360e2","checksum":{"type":"MD5","value":"30646322ecdd3a9f402ead40b1f360e2"},"tabularData":false,"creationDate":"2024-12-06","publicationDate":"2024-12-06","fileAccessRequest":true}},{"description":"aggregated anonymized netflow data for vlan11","label":"netflow-aggr-vlan11.csv.xz","restricted":false,"version":1,"datasetVersionId":2613,"dataFile":{"id":507235,"persistentId":"hdl:21.15109/ARP/FBIIOZ/SEIV5E","pidURL":"https://hdl.handle.net/21.15109/ARP/FBIIOZ/SEIV5E","filename":"netflow-aggr-vlan11.csv.xz","contentType":"application/x-xz","friendlyType":"XZ Archive","filesize":15170660,"description":"aggregated anonymized netflow data for vlan11","storageIdentifier":"s3-sztaki://concorda:1939b187207-b077ae1d1f49","rootDataFileId":-1,"md5":"f5484240762ef633514aa9912f6271a4","checksum":{"type":"MD5","value":"f5484240762ef633514aa9912f6271a4"},"tabularData":false,"creationDate":"2024-12-06","publicationDate":"2024-12-06","fileAccessRequest":true}},{"description":"aggregated anonymized netflow data for 
vlan23","label":"netflow-aggr-vlan23-filtered.csv.xz","restricted":false,"version":1,"datasetVersionId":2613,"dataFile":{"id":507239,"persistentId":"hdl:21.15109/ARP/FBIIOZ/LBKZZZ","pidURL":"https://hdl.handle.net/21.15109/ARP/FBIIOZ/LBKZZZ","filename":"netflow-aggr-vlan23-filtered.csv.xz","contentType":"application/x-xz","friendlyType":"XZ Archive","filesize":36220,"description":"aggregated anonymized netflow data for vlan23","storageIdentifier":"s3-sztaki://concorda:1939b186e53-a5645e88c6c9","rootDataFileId":-1,"md5":"4e1c7d87008c71e24d0d8c1134dece83","checksum":{"type":"MD5","value":"4e1c7d87008c71e24d0d8c1134dece83"},"tabularData":false,"creationDate":"2024-12-06","publicationDate":"2024-12-06","fileAccessRequest":true}},{"description":"anonymized security scan data for cloud vlan","label":"scan-report-cloud.csv.xz","restricted":false,"version":1,"datasetVersionId":2613,"dataFile":{"id":507234,"persistentId":"hdl:21.15109/ARP/FBIIOZ/VJFBPS","pidURL":"https://hdl.handle.net/21.15109/ARP/FBIIOZ/VJFBPS","filename":"scan-report-cloud.csv.xz","contentType":"application/x-xz","friendlyType":"XZ Archive","filesize":31688,"description":"anonymized security scan data for cloud vlan","storageIdentifier":"s3-sztaki://concorda:1939b18752b-8e410cf9380d","rootDataFileId":-1,"md5":"d3e16353da13b2b9e7d5bbc3ef6ddfdf","checksum":{"type":"MD5","value":"d3e16353da13b2b9e7d5bbc3ef6ddfdf"},"tabularData":false,"creationDate":"2024-12-06","publicationDate":"2024-12-06","fileAccessRequest":true}},{"description":"anonymized security scan data for vlan11","label":"scan-report-vlan11.csv.xz","restricted":false,"version":1,"datasetVersionId":2613,"dataFile":{"id":507233,"persistentId":"hdl:21.15109/ARP/FBIIOZ/7JHQHE","pidURL":"https://hdl.handle.net/21.15109/ARP/FBIIOZ/7JHQHE","filename":"scan-report-vlan11.csv.xz","contentType":"application/x-xz","friendlyType":"XZ Archive","filesize":29984,"description":"anonymized security scan data for 
vlan11","storageIdentifier":"s3-sztaki://concorda:1939b18731e-6ea5fa224faf","rootDataFileId":-1,"md5":"a06e462543454cdd8e62823c5f310412","checksum":{"type":"MD5","value":"a06e462543454cdd8e62823c5f310412"},"tabularData":false,"creationDate":"2024-12-06","publicationDate":"2024-12-06","fileAccessRequest":true}},{"description":"anonymized security scan data for vlan23","label":"scan-report-vlan23.csv.xz","restricted":false,"version":1,"datasetVersionId":2613,"dataFile":{"id":507237,"persistentId":"hdl:21.15109/ARP/FBIIOZ/KRGKQS","pidURL":"https://hdl.handle.net/21.15109/ARP/FBIIOZ/KRGKQS","filename":"scan-report-vlan23.csv.xz","contentType":"application/x-xz","friendlyType":"XZ Archive","filesize":48360,"description":"anonymized security scan data for vlan23","storageIdentifier":"s3-sztaki://concorda:1939b186f74-0d4c0c73f848","rootDataFileId":-1,"md5":"b4dedd49cc89a94bdf633756dd5723df","checksum":{"type":"MD5","value":"b4dedd49cc89a94bdf633756dd5723df"},"tabularData":false,"creationDate":"2024-12-06","publicationDate":"2024-12-06","fileAccessRequest":true}}],"citation":"Rigó, Ernő, 2024, \"Anonymized netflow and security scan data\", https://hdl.handle.net/21.15109/ARP/FBIIOZ, ARP, V1"}}