This code is based on Adobe's 2012 Malware Classifier, which I used as a reference and modified for my own purposes.
The code uses the pefile Python module to read and work with PE files.
My Purpose:
a) Extraction of PE file Header and Section information.
b) Compute MD5 sum of binary.
c) Save the information in a CSV file.
First, import the required Python libraries.
import os import pefile import pandas as pd import hashlib
Next, define a function that computes the MD5 hash of a file given its filename. Note: the filename must include the full path if the binary is located in a different directory from the Python script.
def md5sum(filename, blocksize=65536):
    """Return the hexadecimal MD5 digest of a file's contents.

    The file is opened in binary mode and hashed chunk by chunk, so a large
    binary is never held in memory in one piece.

    Args:
        filename: Path of the file to hash. Give a full path when the file
            is not in the current working directory.
        blocksize: Number of bytes requested per read. Must not be zero.

    Returns:
        The MD5 digest as a hexadecimal string.

    Raises:
        ValueError: If ``blocksize`` is zero.
        OSError: If the file cannot be opened or read.
    """
    if blocksize == 0:
        # read(0) always returns b"", which would end the loop immediately
        # and silently produce the digest of empty input instead of the
        # digest of the file.
        raise ValueError("blocksize must not be zero")
    digest = hashlib.md5()
    with open(filename, "rb") as f:
        # iter() with a sentinel keeps calling read() until it returns b"",
        # i.e. until end of file.
        for block in iter(lambda: f.read(blocksize), b""):
            digest.update(block)
    return digest.hexdigest()
Next, define a Python class to represent a PE binary file.
class PEFile:
    """Feature record for one PE binary, parsed with the ``pefile`` module.

    ``__init__`` stores the parser object in ``self.pe`` and one instance
    attribute per extracted feature. ``Construct`` exports every instance
    attribute except ``pe``, in the order they were assigned, so the
    assignment order below is also the column order of the exported data.
    """

    def __init__(self, filename):
        """Parse ``filename`` and store its features on the instance.

        Args:
            filename: Path of the PE binary to read.

        Raises:
            OSError: If the file cannot be opened or read.
            Exception: Anything raised by ``pefile.PE`` for input it cannot
                parse, or by the header lookups below when an expected
                field or data-directory index is absent.
        """
        # The file is only needed long enough to hand its bytes to pefile;
        # the with-block closes it, so no explicit close() is required.
        with open(filename, "rb") as file_content:
            self.pe = pefile.PE(data=file_content.read(), fast_load=True)

        self.filename = filename

        optional_header = self.pe.OPTIONAL_HEADER
        # Data-directory slots read here: 0 (stored as Export*), 2 (Res*),
        # 6 (Debug*) and 12 (IAT*).
        directories = optional_header.DATA_DIRECTORY
        self.DebugSize = directories[6].Size
        self.DebugRVA = directories[6].VirtualAddress
        self.ImageVersion = optional_header.MajorImageVersion
        self.OSVersion = optional_header.MajorOperatingSystemVersion
        self.ExportRVA = directories[0].VirtualAddress
        self.ExportSize = directories[0].Size
        self.IATRVA = directories[12].VirtualAddress
        self.ResSize = directories[2].Size
        self.LinkerVersion = optional_header.MajorLinkerVersion

        # VirtualSize2 is the virtual size of the second section. A binary
        # with fewer than two sections has none, so record 0 rather than
        # failing with IndexError and losing the whole sample.
        sections = self.pe.sections
        self.VirtualSize2 = sections[1].Misc_VirtualSize if len(sections) > 1 else 0

        self.NumberOfSections = self.pe.FILE_HEADER.NumberOfSections
        self.StackReserveSize = optional_header.SizeOfStackReserve
        self.Dll = optional_header.DllCharacteristics

        # fast_load=True skipped the data directories; parse them now so the
        # import table becomes available.
        self.pe.parse_data_directories()
        entry_count, symbol_count = self._count_imports(self.pe)
        # Despite its name, ImportFunctionCount is the number of import
        # directory entries (each entry carries a .dll name);
        # ImportFunctionMethodCount is the total number of imported symbols.
        self.ImportFunctionCount = entry_count
        self.ImportFunctionMethodCount = symbol_count

        self.md5hash = md5sum(filename)
        print ("Loaded PE File")

    @staticmethod
    def _count_imports(pe):
        """Return ``(import entries, imported symbols)`` for a parsed PE."""
        # pefile leaves DIRECTORY_ENTRY_IMPORT unset when the binary has no
        # import directory; treat that as "no imports" instead of letting
        # AttributeError discard the sample.
        entries = getattr(pe, "DIRECTORY_ENTRY_IMPORT", [])
        return len(entries), sum(len(entry.imports) for entry in entries)

    def Construct(self):
        """Return the extracted features as a dict.

        Returns:
            A dict mapping each instance attribute name to its value, with
            the ``pe`` parser object left out.
        """
        sample = {
            feature: value
            for feature, value in self.__dict__.items()
            if feature != "pe"
        }
        print ("Construct completed")
        return sample
Next, define a function that walks a directory tree looking for PE files. The header and section information of each file is extracted, stored in a Python dictionary, and returned.
def pe2vec(
    directory="C:\\Users\\chia0\\Downloads\\dissertation\\malware\\malware\\metasploit_samples\\temp",
):
    """Extract features from every file found under ``directory``.

    The directory tree is walked recursively. Each file is handed to
    ``PEFile``; files that cannot be processed are reported and skipped.

    Args:
        directory: Root directory to search. Defaults to the sample folder
            that was hard-coded in the original script.

    Returns:
        A dict mapping a file name to the feature dict returned by
        ``PEFile.Construct``. The key is the bare file name; when that name
        has already been used by a file from another subdirectory, the full
        path is used instead so the earlier entry is not overwritten.
    """
    # dataset stores the key -> feature-dict mapping
    dataset = {}
    for subdir, _dirs, files in os.walk(directory):
        for f in files:
            file_path = os.path.join(subdir, f)
            try:
                # Construct() returns a dict with feature names as keys
                sample = PEFile(file_path).Construct()
            except Exception as e:
                # One unreadable or non-PE file must not abort the whole
                # run; say which file failed and move on.
                print("{}: {}".format(file_path, e))
                continue
            key = str(f)
            if key in dataset:
                # Same file name seen in another subdirectory.
                key = file_path
            dataset[key] = sample
    return dataset
Finally, define a function that saves the Python dictionary returned by pe2vec() to a CSV file.
def saveToCSV(
    dataset,
    path='C:\\Users\\chia0\\Downloads\\dissertation\\malware\\dataset_m641.csv',
):
    """Write a dict of feature dicts to a CSV file, one row per sample.

    Args:
        dataset: Dict mapping a sample name to a dict of feature values,
            such as the one returned by ``pe2vec``.
        path: Destination CSV file. Defaults to the output location that
            was hard-coded in the original script.
    """
    # DataFrame() turns each top-level key into a column; transposing gives
    # one row per sample and one column per feature.
    df = pd.DataFrame(dataset)
    infected = df.transpose()
    infected.to_csv(path, sep=',', encoding='utf-8')
Full Code is shown below:
import hashlib
import os

import pandas as pd
import pefile


def md5sum(filename, blocksize=65536):
    """Return the hexadecimal MD5 digest of a file's contents.

    The file is opened in binary mode and hashed chunk by chunk, so a large
    binary is never held in memory in one piece.

    Args:
        filename: Path of the file to hash. Give a full path when the file
            is not in the current working directory.
        blocksize: Number of bytes requested per read. Must not be zero.

    Returns:
        The MD5 digest as a hexadecimal string.

    Raises:
        ValueError: If ``blocksize`` is zero.
        OSError: If the file cannot be opened or read.
    """
    if blocksize == 0:
        # read(0) always returns b"", which would end the loop immediately
        # and silently produce the digest of empty input.
        raise ValueError("blocksize must not be zero")
    digest = hashlib.md5()
    with open(filename, "rb") as f:
        # iter() with a sentinel keeps calling read() until it returns b"".
        for block in iter(lambda: f.read(blocksize), b""):
            digest.update(block)
    return digest.hexdigest()


class PEFile:
    """Feature record for one PE binary, parsed with the ``pefile`` module.

    ``__init__`` stores the parser object in ``self.pe`` and one instance
    attribute per extracted feature. ``Construct`` exports every instance
    attribute except ``pe``, in the order they were assigned, so the
    assignment order below is also the column order of the exported data.
    """

    def __init__(self, filename):
        """Parse ``filename`` and store its features on the instance.

        Args:
            filename: Path of the PE binary to read.

        Raises:
            OSError: If the file cannot be opened or read.
            Exception: Anything raised by ``pefile.PE`` for input it cannot
                parse, or by the header lookups below when an expected
                field or data-directory index is absent.
        """
        # The with-block closes the file; no explicit close() is required.
        with open(filename, "rb") as file_content:
            self.pe = pefile.PE(data=file_content.read(), fast_load=True)

        self.filename = filename

        optional_header = self.pe.OPTIONAL_HEADER
        # Data-directory slots read here: 0 (stored as Export*), 2 (Res*),
        # 6 (Debug*) and 12 (IAT*).
        directories = optional_header.DATA_DIRECTORY
        self.DebugSize = directories[6].Size
        self.DebugRVA = directories[6].VirtualAddress
        self.ImageVersion = optional_header.MajorImageVersion
        self.OSVersion = optional_header.MajorOperatingSystemVersion
        self.ExportRVA = directories[0].VirtualAddress
        self.ExportSize = directories[0].Size
        self.IATRVA = directories[12].VirtualAddress
        self.ResSize = directories[2].Size
        self.LinkerVersion = optional_header.MajorLinkerVersion

        # VirtualSize2 is the virtual size of the second section. A binary
        # with fewer than two sections has none, so record 0 rather than
        # failing with IndexError and losing the whole sample.
        sections = self.pe.sections
        self.VirtualSize2 = sections[1].Misc_VirtualSize if len(sections) > 1 else 0

        self.NumberOfSections = self.pe.FILE_HEADER.NumberOfSections
        self.StackReserveSize = optional_header.SizeOfStackReserve
        self.Dll = optional_header.DllCharacteristics

        # fast_load=True skipped the data directories; parse them now so the
        # import table becomes available.
        self.pe.parse_data_directories()
        entry_count, symbol_count = self._count_imports(self.pe)
        # Despite its name, ImportFunctionCount is the number of import
        # directory entries (each entry carries a .dll name);
        # ImportFunctionMethodCount is the total number of imported symbols.
        self.ImportFunctionCount = entry_count
        self.ImportFunctionMethodCount = symbol_count

        self.md5hash = md5sum(filename)
        print ("Loaded PE File")

    @staticmethod
    def _count_imports(pe):
        """Return ``(import entries, imported symbols)`` for a parsed PE."""
        # pefile leaves DIRECTORY_ENTRY_IMPORT unset when the binary has no
        # import directory; treat that as "no imports" instead of letting
        # AttributeError discard the sample.
        entries = getattr(pe, "DIRECTORY_ENTRY_IMPORT", [])
        return len(entries), sum(len(entry.imports) for entry in entries)

    def Construct(self):
        """Return the extracted features as a dict.

        Returns:
            A dict mapping each instance attribute name to its value, with
            the ``pe`` parser object left out.
        """
        sample = {
            feature: value
            for feature, value in self.__dict__.items()
            if feature != "pe"
        }
        print ("Construct completed")
        return sample


def pe2vec(
    directory="C:\\Users\\chia0\\Downloads\\dissertation\\malware\\malware\\metasploit_samples\\temp",
):
    """Extract features from every file found under ``directory``.

    The directory tree is walked recursively. Each file is handed to
    ``PEFile``; files that cannot be processed are reported and skipped.

    Args:
        directory: Root directory to search. Defaults to the sample folder
            that was hard-coded in the original script.

    Returns:
        A dict mapping a file name to the feature dict returned by
        ``PEFile.Construct``. The key is the bare file name; when that name
        has already been used by a file from another subdirectory, the full
        path is used instead so the earlier entry is not overwritten.
    """
    # dataset stores the key -> feature-dict mapping
    dataset = {}
    for subdir, _dirs, files in os.walk(directory):
        for f in files:
            file_path = os.path.join(subdir, f)
            try:
                # Construct() returns a dict with feature names as keys
                sample = PEFile(file_path).Construct()
            except Exception as e:
                # One unreadable or non-PE file must not abort the whole
                # run; say which file failed and move on.
                print("{}: {}".format(file_path, e))
                continue
            key = str(f)
            if key in dataset:
                # Same file name seen in another subdirectory.
                key = file_path
            dataset[key] = sample
    return dataset


def saveToCSV(
    dataset,
    path='C:\\Users\\chia0\\Downloads\\dissertation\\malware\\dataset_m641.csv',
):
    """Write a dict of feature dicts to a CSV file, one row per sample.

    Args:
        dataset: Dict mapping a sample name to a dict of feature values,
            such as the one returned by ``pe2vec``.
        path: Destination CSV file. Defaults to the output location that
            was hard-coded in the original script.
    """
    # DataFrame() turns each top-level key into a column; transposing gives
    # one row per sample and one column per feature.
    df = pd.DataFrame(dataset)
    infected = df.transpose()
    infected.to_csv(path, sep=',', encoding='utf-8')


# Run the extraction over the default sample directory and save the result.
pedata = pe2vec()
saveToCSV(pedata)