Showing posts with label feature extraction. Show all posts
Showing posts with label feature extraction. Show all posts

Wednesday, August 29, 2018

Feature extraction of PEFiles using Python

This post serves as a reminder on how to perform feature extraction of PE binaries using Python.
The code is based on Adobe's 2012 Malware Classifier and has been modified for my own purposes.
It uses the pefile Python module to read and work with PE files.
 
My Purpose:
a) Extraction of PE file Header and Section information.
b) Compute MD5 sum of binary.
c) Save the information in a CSV file.

First, import the required Python libraries.
import os
import pefile
import pandas as pd
import hashlib

Next, define a function that computes the MD5 hash of a file given its filename. *The filename must be a full path if the binary is located in a different directory from the Python script.
def md5sum(filename, blocksize=65536):
    """Return the hexadecimal MD5 digest of the file at *filename*.

    The file is opened in binary mode and read in pieces of at most
    *blocksize* bytes, so the whole file is never held in memory at once.
    """
    digest = hashlib.md5()
    with open(filename, "rb") as stream:
        while True:
            chunk = stream.read(blocksize)
            # An empty read marks end of file.
            if chunk == b"":
                break
            digest.update(chunk)
    return digest.hexdigest()

Next, define a Python class to represent the PE binary file.
class PEFile:
    """Flat feature record extracted from one PE binary.

    Constructing an instance parses the file with the ``pefile`` module and
    stores selected header, section and import-table values, plus the MD5
    digest of the file, as instance attributes. ``Construct`` returns those
    attributes as a plain dictionary.
    """

    def __init__(self, filename):
        """Parse *filename* and populate the feature attributes.

        Args:
            filename: Path to the PE binary (full path if it is not in the
                current working directory).

        Raises:
            Whatever ``open``, ``pefile.PE`` or ``md5sum`` raise for an
            unreadable or malformed file.
        """
        with open(filename, "rb") as file_content:
            self.pe = pefile.PE(data=file_content.read(), fast_load=True)
        optional_header = self.pe.OPTIONAL_HEADER
        directories = optional_header.DATA_DIRECTORY
        sections = self.pe.sections

        def directory_field(index, field):
            # NOTE(review): the data-directory list may be shorter than the
            # fixed indexes read below; report 0 for an absent entry instead
            # of raising IndexError and losing the whole sample.
            if index < len(directories):
                return getattr(directories[index], field)
            return 0

        # Attribute assignment order is kept stable because Construct()
        # exposes the attributes in insertion order.
        self.filename = filename
        self.DebugSize = directory_field(6, "Size")
        self.DebugRVA = directory_field(6, "VirtualAddress")
        self.ImageVersion = optional_header.MajorImageVersion
        self.OSVersion = optional_header.MajorOperatingSystemVersion
        self.ExportRVA = directory_field(0, "VirtualAddress")
        self.ExportSize = directory_field(0, "Size")
        self.IATRVA = directory_field(12, "VirtualAddress")
        self.ResSize = directory_field(2, "Size")
        self.LinkerVersion = optional_header.MajorLinkerVersion
        # Virtual size of the second section; 0 when the binary has fewer
        # than two sections.
        self.VirtualSize2 = sections[1].Misc_VirtualSize if len(sections) > 1 else 0
        self.NumberOfSections = self.pe.FILE_HEADER.NumberOfSections
        self.StackReserveSize = optional_header.SizeOfStackReserve
        self.Dll = optional_header.DllCharacteristics

        # fast_load=True skipped the data directories; parse them now so the
        # import table becomes available.
        self.pe.parse_data_directories()
        # The attribute is absent when the binary has no import table.
        import_entries = getattr(self.pe, "DIRECTORY_ENTRY_IMPORT", [])
        # One entry per imported DLL ...
        self.ImportFunctionCount = len(import_entries)
        # ... and the total number of symbols imported across all of them.
        self.ImportFunctionMethodCount = sum(
            len(entry.imports) for entry in import_entries
        )
        self.md5hash = md5sum(filename)
        print("Loaded PE File")

    def Construct(self):
        """Return the extracted features as a ``{name: value}`` dictionary.

        Every instance attribute is included except ``pe``, the parsed
        pefile object itself.
        """
        sample = {
            feature: value
            for feature, value in self.__dict__.items()
            if feature != "pe"
        }
        print("Construct completed")
        return sample

Function that recursively walks a directory looking for PE files. The PE header and section information of each file is extracted, stored in a Python dictionary, and returned.
def pe2vec(directory="C:\\Users\\chia0\\Downloads\\dissertation\\malware\\malware\\metasploit_samples\\temp"):
    """Extract features from every file under *directory*.

    The directory tree is walked recursively. Each file is loaded with
    ``PEFile``; files that fail to load are reported and skipped.

    Args:
        directory: Root directory to search. Defaults to the sample
            directory the script was originally written for.

    Returns:
        dict mapping a file key to the feature dictionary returned by
        ``PEFile.Construct``. The key is the file's base name; if that name
        has already been used by a file in another subdirectory, the path
        relative to *directory* is used instead so no sample is overwritten.
    """
    dataset = {}
    for subdir, _dirs, files in os.walk(directory):
        for f in files:
            file_path = os.path.join(subdir, f)
            try:
                features = PEFile(file_path).Construct()
            except Exception as e:
                # Best-effort scan: one unreadable or non-PE file must not
                # abort the run, but say which file was skipped.
                print("Skipping {}: {}".format(file_path, e))
                continue
            key = str(f)
            if key in dataset:
                # Same base name seen earlier in a different subdirectory.
                key = os.path.relpath(file_path, directory)
            dataset[key] = features
    return dataset

Function that saves the Python dictionary returned by pe2vec() to a CSV file.
def saveToCSV(dataset, csv_path='C:\\Users\\chia0\\Downloads\\dissertation\\malware\\dataset_m641.csv'):
    """Write *dataset* to a comma-separated, UTF-8 encoded CSV file.

    Args:
        dataset: dict mapping a sample key to a ``{feature: value}`` dict.
        csv_path: Destination file. Defaults to the path the script was
            originally written for.

    The output has one row per sample key and one column per feature.
    """
    df = pd.DataFrame(dataset)
    # The DataFrame is built with samples as columns; transpose so each
    # sample becomes a row.
    infected = df.transpose()
    infected.to_csv(csv_path, sep=',', encoding='utf-8')
    


Full Code is shown below:
import os
import pefile
import pandas as pd
import hashlib

def md5sum(filename, blocksize=65536):
    """Return the hexadecimal MD5 digest of the file at *filename*.

    The file is opened in binary mode and read in pieces of at most
    *blocksize* bytes, so the whole file is never held in memory at once.
    """
    digest = hashlib.md5()
    with open(filename, "rb") as stream:
        while True:
            chunk = stream.read(blocksize)
            # An empty read marks end of file.
            if chunk == b"":
                break
            digest.update(chunk)
    return digest.hexdigest()

class PEFile:
    """Flat feature record extracted from one PE binary.

    Constructing an instance parses the file with the ``pefile`` module and
    stores selected header, section and import-table values, plus the MD5
    digest of the file, as instance attributes. ``Construct`` returns those
    attributes as a plain dictionary.
    """

    def __init__(self, filename):
        """Parse *filename* and populate the feature attributes.

        Args:
            filename: Path to the PE binary (full path if it is not in the
                current working directory).

        Raises:
            Whatever ``open``, ``pefile.PE`` or ``md5sum`` raise for an
            unreadable or malformed file.
        """
        with open(filename, "rb") as file_content:
            self.pe = pefile.PE(data=file_content.read(), fast_load=True)
        optional_header = self.pe.OPTIONAL_HEADER
        directories = optional_header.DATA_DIRECTORY
        sections = self.pe.sections

        def directory_field(index, field):
            # NOTE(review): the data-directory list may be shorter than the
            # fixed indexes read below; report 0 for an absent entry instead
            # of raising IndexError and losing the whole sample.
            if index < len(directories):
                return getattr(directories[index], field)
            return 0

        # Attribute assignment order is kept stable because Construct()
        # exposes the attributes in insertion order.
        self.filename = filename
        self.DebugSize = directory_field(6, "Size")
        self.DebugRVA = directory_field(6, "VirtualAddress")
        self.ImageVersion = optional_header.MajorImageVersion
        self.OSVersion = optional_header.MajorOperatingSystemVersion
        self.ExportRVA = directory_field(0, "VirtualAddress")
        self.ExportSize = directory_field(0, "Size")
        self.IATRVA = directory_field(12, "VirtualAddress")
        self.ResSize = directory_field(2, "Size")
        self.LinkerVersion = optional_header.MajorLinkerVersion
        # Virtual size of the second section; 0 when the binary has fewer
        # than two sections.
        self.VirtualSize2 = sections[1].Misc_VirtualSize if len(sections) > 1 else 0
        self.NumberOfSections = self.pe.FILE_HEADER.NumberOfSections
        self.StackReserveSize = optional_header.SizeOfStackReserve
        self.Dll = optional_header.DllCharacteristics

        # fast_load=True skipped the data directories; parse them now so the
        # import table becomes available.
        self.pe.parse_data_directories()
        # The attribute is absent when the binary has no import table.
        import_entries = getattr(self.pe, "DIRECTORY_ENTRY_IMPORT", [])
        # One entry per imported DLL ...
        self.ImportFunctionCount = len(import_entries)
        # ... and the total number of symbols imported across all of them.
        self.ImportFunctionMethodCount = sum(
            len(entry.imports) for entry in import_entries
        )
        self.md5hash = md5sum(filename)
        print("Loaded PE File")

    def Construct(self):
        """Return the extracted features as a ``{name: value}`` dictionary.

        Every instance attribute is included except ``pe``, the parsed
        pefile object itself.
        """
        sample = {
            feature: value
            for feature, value in self.__dict__.items()
            if feature != "pe"
        }
        print("Construct completed")
        return sample
    
def pe2vec(directory="C:\\Users\\chia0\\Downloads\\dissertation\\malware\\malware\\metasploit_samples\\temp"):
    """Extract features from every file under *directory*.

    The directory tree is walked recursively. Each file is loaded with
    ``PEFile``; files that fail to load are reported and skipped.

    Args:
        directory: Root directory to search. Defaults to the sample
            directory the script was originally written for.

    Returns:
        dict mapping a file key to the feature dictionary returned by
        ``PEFile.Construct``. The key is the file's base name; if that name
        has already been used by a file in another subdirectory, the path
        relative to *directory* is used instead so no sample is overwritten.
    """
    dataset = {}
    for subdir, _dirs, files in os.walk(directory):
        for f in files:
            file_path = os.path.join(subdir, f)
            try:
                features = PEFile(file_path).Construct()
            except Exception as e:
                # Best-effort scan: one unreadable or non-PE file must not
                # abort the run, but say which file was skipped.
                print("Skipping {}: {}".format(file_path, e))
                continue
            key = str(f)
            if key in dataset:
                # Same base name seen earlier in a different subdirectory.
                key = os.path.relpath(file_path, directory)
            dataset[key] = features
    return dataset
    
def saveToCSV(dataset, csv_path='C:\\Users\\chia0\\Downloads\\dissertation\\malware\\dataset_m641.csv'):
    """Write *dataset* to a comma-separated, UTF-8 encoded CSV file.

    Args:
        dataset: dict mapping a sample key to a ``{feature: value}`` dict.
        csv_path: Destination file. Defaults to the path the script was
            originally written for.

    The output has one row per sample key and one column per feature.
    """
    df = pd.DataFrame(dataset)
    # The DataFrame is built with samples as columns; transpose so each
    # sample becomes a row.
    infected = df.transpose()
    infected.to_csv(csv_path, sep=',', encoding='utf-8')
    
# Script driver: extract features from the sample directory, then write
# them to the CSV file. The result stays bound to the module-level name.
pedata = pe2vec(); saveToCSV(pedata)