I want to add and extract files from an Office/Excel document using Python. So far, adding things is easy, but for extracting I haven't found a clean solution.
To mak
I built a Python module to do exactly this; check it out at https://pypi.org/project/AttachmentsExtractor/ (the module can also be run on any OS).
After installing the library, use the following code snippet:
# Pull every attachment out of an Office file with the third-party
# AttachmentsExtractor package.
from AttachmentsExtractor import extractor

# Source document (absolute path) and the directory that receives the
# extracted attachments; both are placeholders to be filled in.
abs_path_to_file = 'Please provide absolute path here '
path_to_destination_directory = (
    'Please provide path of the directory where the extracted attachments should be stored'
)

# extract() returns true if one or more attachments are found, else false.
extractor.extract(
    abs_path_to_file,
    path_to_destination_directory,
)
Consider using the Windows temp directory, which temporarily stores the OLE object's source file when it is embedded in a workbook. This solution works with physical files rather than the clipboard.
With this approach, you will need to retrieve the current user's name and iterate through all files of the temp directory: C:\Documents and Settings\{username}\Local Settings\Temp (standard Excel dump folder for Windows Vista/7/8/10). Also, a conditional like-name search with `in` is used, matching any file name that contains the original file's basename, because multiple versions with number suffixes (1), (2), (3), ... may exist depending on how many times the script runs. You could even try a regex search here.
Finally, the routine below uses a try...except...finally block to cleanly exit the Excel objects regardless of error, while still printing any exception message. Do note this is a Windows-only solution, demonstrated with a text file.
import win32com.client as win32
import os, shutil
from tkinter import messagebox
# (0) Setup
# Embed a text file into a workbook as an OLE object, activate it, then copy
# the like-named file found under the temp folder next to this script and
# show its content in a message box.
dir_path = cd = os.path.dirname(os.path.abspath(__file__))
print(dir_path)

# Bind the COM handles up front so the ``finally`` clause can tell what was
# actually opened; otherwise a start-up failure would surface as a NameError
# that hides the real error.
excel = wb = ws = objs = ole = None

try:
    excel = win32.gencache.EnsureDispatch('Excel.Application')
    wb = excel.Workbooks.Open(os.path.join(dir_path, "test_excel.xlsx"))
    ws = wb.Worksheets(1)
    objs = ws.OLEObjects()

    # (1) Embed file
    f = os.path.join(dir_path, "test_txt.txt")
    name = "test_txt_ole.txt"
    objs.Add(Filename=f, IconLabel=name).Name = 'Test'

    # (2) Open file from temporary folder
    ole = ws.OLEObjects(1)
    ole.Activate()

    # (3) Grab the recent like-named file
    user = os.environ.get('USERNAME')
    outfile = os.path.join(dir_path, "test_txt_out.txt")
    tempfolder = r"C:\Documents and Settings\{}\Local Settings\Temp".format(user)

    # Join each match with the directory it was found in (``subdir``), not
    # with the walk root, so files located in sub-folders resolve correctly.
    candidates = [
        os.path.join(subdir, filename)
        for subdir, dirs, files in os.walk(tempfolder)
        for filename in files
        if 'test_txt' in filename
    ]
    if not candidates:
        raise FileNotFoundError(
            "no file containing 'test_txt' found under {}".format(tempfolder)
        )
    # Several numbered copies may exist from earlier runs; the modification
    # time, unlike a name sort, reliably identifies the latest one.
    temp_copy = max(candidates, key=os.path.getmtime)
    shutil.copyfile(temp_copy, outfile)

    # (4) Read text content
    with open(outfile, 'r') as fh:
        content = fh.readlines()

    # (5) Output message with content
    messagebox.showinfo(title="test_txt_ole.txt", message="".join(content))

except Exception as e:
    # Top-level boundary of the script: report the problem and fall through
    # to the clean-up below.
    print(e)

finally:
    try:
        if wb is not None:
            wb.Close(True)  # CLOSES AND SAVES WORKBOOK
    finally:
        if excel is not None:
            excel.Quit()  # QUITS EXCEL APP (must be called, not just referenced)
        # RELEASES COM RESOURCES
        ws = wb = objs = ole = excel = None
Tkinter Messagebox
Well, I find Parfait's solution a bit hackish (in the bad sense) because
So, I wrote an alternative solution. The essence of it is the following:
unzip the .xlsx file (or any other Office file in the new XML-based format, which is not password protected) to a temporary path.
iterate through all .bin files inside the '/xxx/embeddings' ('xxx' = 'xl' or 'word' or 'ppt'), and create a dictionary that contains the .bin files' temporary paths as keys and the dictionaries returned from step 3 as values.
extract information from the .bin file according to the (not very well documented) Ole Packager format, and return the information as a dictionary. (Retrieves the raw binary data as 'contents', not only from .txt but any file type, e.g. .png)
I'm still learning Python, so this is not perfect (no error checking, no performance optimization) but you can get the idea from it. I tested it on a few examples. Here is my code:
import tempfile
import os
import shutil
import zipfile
import glob
import pythoncom
import win32com.storagecon
def read_zipped_xml_bin_embeddings(path_zipped_xml):
    """Parse every embedded ``.bin`` object of a zipped-XML Office file.

    The archive is unpacked into a temporary directory, each
    ``<subdir>/embeddings/*.bin`` file is handed to
    ``bin_embedding_to_dictionary`` and the temporary directory is removed
    again, even when an error occurs part-way through.

    Args:
        path_zipped_xml: Path to the Office file. The extension (matched
            case-insensitively) selects the archive sub-folder: ``xl``,
            ``word`` or ``ppt``.

    Returns:
        A dict mapping the temporary path of each ``.bin`` file to whatever
        ``bin_embedding_to_dictionary`` returned for it. The keys are
        identifiers only: the files are deleted before this function returns.

    Raises:
        KeyError: The file extension is not one of the supported ones.
    """
    subdir_by_extension = {
        '.xlsx': 'xl',
        '.xlsm': 'xl',
        '.xltx': 'xl',
        '.xltm': 'xl',
        '.docx': 'word',
        '.dotx': 'word',
        '.docm': 'word',
        '.dotm': 'word',
        '.pptx': 'ppt',
        '.pptm': 'ppt',
        '.potx': 'ppt',
        '.potm': 'ppt',
    }

    temp_dir = tempfile.mkdtemp()
    try:
        with zipfile.ZipFile(path_zipped_xml) as zip_file:
            zip_file.extractall(temp_dir)

        # Lower-case the extension so that e.g. "REPORT.XLSX" is accepted.
        extension = os.path.splitext(path_zipped_xml)[1].lower()
        subdir = subdir_by_extension[extension]

        pattern = os.path.join(temp_dir, subdir, 'embeddings', '*.bin')
        return {
            bin_file: bin_embedding_to_dictionary(bin_file)
            for bin_file in glob.glob(pattern)
        }
    finally:
        # Always remove the extracted copy, including on failure.
        shutil.rmtree(temp_dir)
def _read_exact(stream, size):
    """Read exactly *size* bytes from *stream*; raise ValueError if it ends early."""
    data = stream.Read(size)
    if len(data) != size:
        raise ValueError(
            'unexpected end of stream: wanted {} bytes, got {}'.format(size, len(data))
        )
    return data


def _decode_ansi(raw):
    """Decode ANSI bytes to text using the Windows ANSI code page ('mbcs')."""
    return raw.decode('mbcs', errors='replace')


def _read_cstring(stream):
    """Read a NUL-terminated ANSI string; the terminator is consumed, not returned."""
    collected = bytearray()
    while True:
        ch = _read_exact(stream, 1)
        # Read() yields bytes on Python 3, so compare against a bytes NUL.
        if ch == b'\0':
            break
        collected += ch
    return _decode_ansi(bytes(collected))


def _read_uint32(stream):
    """Read a 4-byte little-endian unsigned integer from *stream*."""
    return int.from_bytes(_read_exact(stream, 4), 'little')


def bin_embedding_to_dictionary(bin_file):
    """Extract the packaged file stored in an embedded OLE ``.bin`` storage.

    Opens *bin_file* as a structured storage, locates the ``\\1Ole10Native``
    stream and decodes its fields in order: file name, file path, temporary
    file path, payload size and payload.

    Args:
        bin_file: Path to the ``.bin`` file.

    Returns:
        A dict with the keys ``'original_filename'``, ``'original_filepath'``
        and ``'temporary_filepath'`` (text), ``'size'`` (int) and
        ``'contents'`` (raw bytes), or ``None`` when the storage has no
        ``\\1Ole10Native`` stream.

    Raises:
        ValueError: The stream ends before all announced bytes could be read.
    """
    mode = win32com.storagecon.STGM_READ | win32com.storagecon.STGM_SHARE_EXCLUSIVE
    storage = pythoncom.StgOpenStorage(bin_file, None, mode)

    for stastg in storage.EnumElements():
        if stastg[0] != '\1Ole10Native':
            continue

        stream = storage.OpenStream(stastg[0], None, mode)
        result = {}

        # original filename in ANSI starts at byte 7 and is null terminated
        stream.Seek(6, 0)
        result['original_filename'] = _read_cstring(stream)

        # original filepath in ANSI is next and is null terminated
        result['original_filepath'] = _read_cstring(stream)

        stream.Seek(4, 1)  # next 4 bytes is unused

        # size of the temporary file path in ANSI, in little endian
        temporary_filepath_size = _read_uint32(stream)
        # Drop trailing NULs so the value is directly usable as a path string.
        result['temporary_filepath'] = _decode_ansi(
            _read_exact(stream, temporary_filepath_size)
        ).rstrip('\0')

        # size of the contents, in little endian, followed by the contents
        result['size'] = _read_uint32(stream)
        result['contents'] = _read_exact(stream, result['size'])
        return result

    return None
You can use it like this:
# Example: read the embedded objects of the test workbook and print the
# fields of the first one. The keys of ``objects`` are the temporary paths.
objects = read_zipped_xml_bin_embeddings(os.path.join(dir_path, 'test_excel.xlsx'))

# Dict views cannot be indexed on Python 3; take the first value through an
# iterator and fall back to None when nothing was found.
obj = next(iter(objects.values()), None)

if obj is None:
    print('No embedded object found')
else:
    # str.format works whether a field is text or bytes, unlike '+'.
    print('Original filename: {}'.format(obj['original_filename']))
    print('Original filepath: {}'.format(obj['original_filepath']))
    print('Temporary filepath: {}'.format(obj['temporary_filepath']))
    # The contents are raw binary data, so show their repr.
    print('Contents: {!r}'.format(obj['contents']))