Wednesday, August 29, 2018

Feature extraction of PEFiles using Python

This post serves as a reminder on how to perform feature extraction of PE binaries using Python.
The code is based on Adobe's 2012 Malware Classifier, modified for my own purposes.
The code uses the pefile Python module to read and work with PE files.
 
My Purpose:
a) Extraction of PE file Header and Section information.
b) Compute MD5 sum of binary.
c) Save the information in a CSV file.

First, import the required Python libraries.
import os
import pefile
import pandas as pd
import hashlib

Next, define a function that computes the MD5 hash of a file given its filename. Note: the filename must include the full path if the binary is located in a different directory from the Python script.
def md5sum(filename, blocksize=65536):
    """Return the hexadecimal MD5 digest of the file at *filename*.

    The file is opened in binary mode and fed to the hash ``blocksize``
    bytes at a time, so the digest is built up incrementally.
    """
    digest = hashlib.md5()
    with open(filename, "rb") as stream:
        chunk = stream.read(blocksize)
        # read() returns b"" at end of file, which ends the loop.
        while chunk:
            digest.update(chunk)
            chunk = stream.read(blocksize)
    return digest.hexdigest()

Next define a python class to represent the PE binary file.
class PEFile:
    """Feature record for a single PE binary.

    ``__init__`` parses the file with pefile and copies selected header,
    section and import-table values onto instance attributes, together with
    the file's MD5 digest. Every attribute except ``pe`` becomes one feature
    in the dictionary returned by ``Construct``; attributes are assigned in a
    fixed order so that the feature order is the same for every sample.
    """

    def __init__(self, filename):
        """Parse *filename* and collect its features.

        Errors raised while opening or parsing the file propagate to the
        caller. A truncated data-directory table (fewer than 13 entries)
        raises IndexError.
        """
        with open(filename, "rb") as file_content:
            # fast_load=True defers the data directories; they are parsed
            # explicitly below, before the import table is inspected.
            self.pe = pefile.PE(data=file_content.read(), fast_load=True)
        self.filename = filename

        optional_header = self.pe.OPTIONAL_HEADER
        # Data-directory slots used: 0 = export, 2 = resource, 6 = debug,
        # 12 = import address table.
        directories = optional_header.DATA_DIRECTORY
        self.DebugSize = directories[6].Size
        self.DebugRVA = directories[6].VirtualAddress
        self.ImageVersion = optional_header.MajorImageVersion
        self.OSVersion = optional_header.MajorOperatingSystemVersion
        self.ExportRVA = directories[0].VirtualAddress
        self.ExportSize = directories[0].Size
        self.IATRVA = directories[12].VirtualAddress
        self.ResSize = directories[2].Size
        self.LinkerVersion = optional_header.MajorLinkerVersion

        # Virtual size of the second section; a binary with fewer than two
        # sections has none, so record 0 instead of raising IndexError.
        sections = self.pe.sections
        self.VirtualSize2 = sections[1].Misc_VirtualSize if len(sections) > 1 else 0

        self.NumberOfSections = self.pe.FILE_HEADER.NumberOfSections
        self.StackReserveSize = optional_header.SizeOfStackReserve
        self.Dll = optional_header.DllCharacteristics

        self.pe.parse_data_directories()
        # pefile only sets DIRECTORY_ENTRY_IMPORT when the binary has an
        # import table; treat a missing table as zero imports.
        import_entries = getattr(self.pe, "DIRECTORY_ENTRY_IMPORT", [])
        # Number of import-directory entries (one per imported DLL).
        self.ImportFunctionCount = len(import_entries)
        # Total number of imported symbols across all entries.
        self.ImportFunctionMethodCount = sum(
            len(entry.imports) for entry in import_entries
        )

        self.md5hash = md5sum(filename)
        print("Loaded PE File")

    def Construct(self):
        """Return the collected features as a dict, omitting the ``pe`` object."""
        sample = {
            feature: value
            for feature, value in self.__dict__.items()
            if feature != "pe"
        }
        print("Construct completed")
        return sample

Next, define a function that walks a directory tree looking for PE files. The PE header and section information of each file is extracted, stored in a Python dictionary, and returned.
def pe2vec(directory="C:\\Users\\chia0\\Downloads\\dissertation\\malware\\malware\\metasploit_samples\\temp"):
    """Extract features from every file under *directory*, recursively.

    Args:
        directory: Root folder to search. Defaults to the sample folder the
            script was originally written for.

    Returns:
        A dict mapping each file to its feature dict (as returned by
        ``PEFile.Construct``). The key is the bare file name; if that name
        has already been used by a file in another subdirectory, the full
        path is used instead so that no sample is overwritten.

    Files that cannot be loaded are reported on stdout and skipped.
    """
    dataset = {}

    for subdir, _dirs, files in os.walk(directory):
        for name in files:
            file_path = os.path.join(subdir, name)
            try:
                # Parse the file and flatten it into a feature dictionary.
                features = PEFile(file_path).Construct()
            except Exception as e:
                # Best effort: one unreadable or non-PE file must not abort
                # the whole run, but say which file was skipped and why.
                print("Skipping {}: {}".format(file_path, e))
                continue
            key = str(name)
            if key in dataset:
                # Same file name seen in another subdirectory.
                key = file_path
            dataset[key] = features
    return dataset

Finally, define a function that saves the Python dictionary returned by pe2vec() to a CSV file.
def saveToCSV(dataset, path='C:\\Users\\chia0\\Downloads\\dissertation\\malware\\dataset_m641.csv'):
    """Write *dataset* to a comma-separated, UTF-8 encoded CSV file.

    Args:
        dataset: Dict mapping a sample name to its feature dict.
        path: Destination file. Defaults to the location the script was
            originally written for.

    The CSV has one row per sample (the sample name is the index column)
    and one column per feature.
    """
    # DataFrame(dataset) lays samples out as columns; transpose so that each
    # sample becomes a row.
    samples = pd.DataFrame(dataset).transpose()
    samples.to_csv(path, sep=',', encoding='utf-8')
    


Full Code is shown below:
import os
import pefile
import pandas as pd
import hashlib

def md5sum(filename, blocksize=65536):
    """Return the hexadecimal MD5 digest of the file at *filename*.

    The file is opened in binary mode and fed to the hash ``blocksize``
    bytes at a time, so the digest is built up incrementally.
    """
    digest = hashlib.md5()
    with open(filename, "rb") as stream:
        chunk = stream.read(blocksize)
        # read() returns b"" at end of file, which ends the loop.
        while chunk:
            digest.update(chunk)
            chunk = stream.read(blocksize)
    return digest.hexdigest()

class PEFile:
    """Feature record for a single PE binary.

    ``__init__`` parses the file with pefile and copies selected header,
    section and import-table values onto instance attributes, together with
    the file's MD5 digest. Every attribute except ``pe`` becomes one feature
    in the dictionary returned by ``Construct``; attributes are assigned in a
    fixed order so that the feature order is the same for every sample.
    """

    def __init__(self, filename):
        """Parse *filename* and collect its features.

        Errors raised while opening or parsing the file propagate to the
        caller. A truncated data-directory table (fewer than 13 entries)
        raises IndexError.
        """
        with open(filename, "rb") as file_content:
            # fast_load=True defers the data directories; they are parsed
            # explicitly below, before the import table is inspected.
            self.pe = pefile.PE(data=file_content.read(), fast_load=True)
        self.filename = filename

        optional_header = self.pe.OPTIONAL_HEADER
        # Data-directory slots used: 0 = export, 2 = resource, 6 = debug,
        # 12 = import address table.
        directories = optional_header.DATA_DIRECTORY
        self.DebugSize = directories[6].Size
        self.DebugRVA = directories[6].VirtualAddress
        self.ImageVersion = optional_header.MajorImageVersion
        self.OSVersion = optional_header.MajorOperatingSystemVersion
        self.ExportRVA = directories[0].VirtualAddress
        self.ExportSize = directories[0].Size
        self.IATRVA = directories[12].VirtualAddress
        self.ResSize = directories[2].Size
        self.LinkerVersion = optional_header.MajorLinkerVersion

        # Virtual size of the second section; a binary with fewer than two
        # sections has none, so record 0 instead of raising IndexError.
        sections = self.pe.sections
        self.VirtualSize2 = sections[1].Misc_VirtualSize if len(sections) > 1 else 0

        self.NumberOfSections = self.pe.FILE_HEADER.NumberOfSections
        self.StackReserveSize = optional_header.SizeOfStackReserve
        self.Dll = optional_header.DllCharacteristics

        self.pe.parse_data_directories()
        # pefile only sets DIRECTORY_ENTRY_IMPORT when the binary has an
        # import table; treat a missing table as zero imports.
        import_entries = getattr(self.pe, "DIRECTORY_ENTRY_IMPORT", [])
        # Number of import-directory entries (one per imported DLL).
        self.ImportFunctionCount = len(import_entries)
        # Total number of imported symbols across all entries.
        self.ImportFunctionMethodCount = sum(
            len(entry.imports) for entry in import_entries
        )

        self.md5hash = md5sum(filename)
        print("Loaded PE File")

    def Construct(self):
        """Return the collected features as a dict, omitting the ``pe`` object."""
        sample = {
            feature: value
            for feature, value in self.__dict__.items()
            if feature != "pe"
        }
        print("Construct completed")
        return sample
    
def pe2vec(directory="C:\\Users\\chia0\\Downloads\\dissertation\\malware\\malware\\metasploit_samples\\temp"):
    """Extract features from every file under *directory*, recursively.

    Args:
        directory: Root folder to search. Defaults to the sample folder the
            script was originally written for.

    Returns:
        A dict mapping each file to its feature dict (as returned by
        ``PEFile.Construct``). The key is the bare file name; if that name
        has already been used by a file in another subdirectory, the full
        path is used instead so that no sample is overwritten.

    Files that cannot be loaded are reported on stdout and skipped.
    """
    dataset = {}

    for subdir, _dirs, files in os.walk(directory):
        for name in files:
            file_path = os.path.join(subdir, name)
            try:
                # Parse the file and flatten it into a feature dictionary.
                features = PEFile(file_path).Construct()
            except Exception as e:
                # Best effort: one unreadable or non-PE file must not abort
                # the whole run, but say which file was skipped and why.
                print("Skipping {}: {}".format(file_path, e))
                continue
            key = str(name)
            if key in dataset:
                # Same file name seen in another subdirectory.
                key = file_path
            dataset[key] = features
    return dataset
    
def saveToCSV(dataset, path='C:\\Users\\chia0\\Downloads\\dissertation\\malware\\dataset_m641.csv'):
    """Write *dataset* to a comma-separated, UTF-8 encoded CSV file.

    Args:
        dataset: Dict mapping a sample name to its feature dict.
        path: Destination file. Defaults to the location the script was
            originally written for.

    The CSV has one row per sample (the sample name is the index column)
    and one column per feature.
    """
    # DataFrame(dataset) lays samples out as columns; transpose so that each
    # sample becomes a row.
    samples = pd.DataFrame(dataset).transpose()
    samples.to_csv(path, sep=',', encoding='utf-8')
    
# Run the pipeline: collect the feature dictionary with pe2vec(), then write
# it out with saveToCSV(). These statements sit at module level, so they run
# on import as well as when the file is executed as a script.
pedata = pe2vec()
saveToCSV(pedata)





No comments:

Post a Comment