| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220 |
import os
import platform
import re
import subprocess
import sys
from email.message import EmailMessage
from email.parser import BytesParser
from email.utils import parsedate
from enum import Enum
from tempfile import TemporaryDirectory
from typing import List, Union
from zipfile import BadZipFile, ZipFile, ZipInfo
-
class BooleanOperator(Enum):
    """Logical connective used to combine the results of several filters."""

    and_op = 1  # all combined filters have to match
    or_op = 2  # a single matching filter is enough
-
class Filter:
    """Base class for message filters.

    Subclasses override :meth:`matches` to decide whether a message
    satisfies the filter.
    """

    def matches(self, message: EmailMessage) -> bool:
        """Return True when *message* satisfies this filter.

        Raises:
            NotImplementedError: always, in this base class.
        """
        # Raising a plain string is itself a TypeError in Python 3; raise a
        # real exception so a missing override is reported clearly.
        raise NotImplementedError("Not implemented")
-
class BodyKeywordFilter(Filter):
    """Simple substring search filter over the text parts of a message.

    For every ``text/*`` part, the keyword is looked for in the part's raw
    serialization (its headers plus the body as transferred) and, if not
    found there, in the body after its content-transfer-encoding has been
    decoded. The second pass is what finds keywords inside base64 or
    quoted-printable bodies.
    """

    def __init__(self, keyword: str, case_sensitive: bool = False):
        """Store the keyword to look for and whether case matters."""
        self.keyword: str = keyword
        self.case_sensitive: bool = case_sensitive

    def matches(self, message: EmailMessage) -> bool:
        """Return True if any text part of *message* contains the keyword."""
        # Fold the keyword once instead of once per part.
        needle = self.keyword if self.case_sensitive else self.keyword.lower()
        for part in message.walk():
            if part.get_content_maintype() != 'text':
                continue
            if self._contains(part.as_string(), needle):
                return True
            # Only decode the payload when the raw form did not match.
            if self._contains(self._decoded_body(part), needle):
                return True
        return False

    def _contains(self, text: str, needle: str) -> bool:
        """Return True if *needle* occurs in *text* under the case setting."""
        haystack = text if self.case_sensitive else text.lower()
        return needle in haystack

    @staticmethod
    def _decoded_body(part) -> str:
        """Return the transfer-decoded body of *part* as text ('' if none)."""
        payload = part.get_payload(decode=True)
        if not isinstance(payload, bytes):
            return ''
        charset = part.get_content_charset() or 'utf-8'
        try:
            return payload.decode(charset, errors='replace')
        except (LookupError, ValueError):
            # Unknown or malformed charset label: fall back to UTF-8.
            return payload.decode('utf-8', errors='replace')
-
class HeaderFilter(Filter):
    """Matches a value in an email header. Can search one header or multiple.
    Header names case-insensitive; value is case-insensitive."""

    def __init__(self, headers: Union[str, List[str]], value: str):
        """Store the header name(s) to inspect and the substring to find.

        A single header name given as a string is wrapped in a list.
        """
        self.headers: List[str] = [headers] if isinstance(headers, str) else headers
        self.value = value

    def matches(self, message: EmailMessage) -> bool:
        """Return True if any occurrence of any configured header contains
        the value, ignoring case."""
        needle = self.value.lower()
        for header in self.headers:
            # get_all() also covers headers that appear more than once.
            for occurrence in message.get_all(header, []):
                # The parser may hand back a non-str header object;
                # str() makes the comparison safe either way.
                if needle in str(occurrence).lower():
                    return True
        return False
-
class BooleanFilter(Filter):
    """Combines other filters with OR/AND logic."""

    def __init__(self, operator: BooleanOperator, subfilters: list):
        """Remember the connective and the filters it applies to."""
        self.operator = operator
        self.subfilters: List[Filter] = subfilters

    def matches(self, message: EmailMessage) -> bool:
        """Evaluate the sub-filters in order, stopping as soon as the result
        is decided.

        With ``and_op`` the first failing sub-filter yields False and an
        exhausted list yields True; with ``or_op`` the first matching
        sub-filter yields True and an exhausted list yields False.
        """
        require_all = self.operator == BooleanOperator.and_op
        accept_any = self.operator == BooleanOperator.or_op
        for candidate in self.subfilters:
            matched = candidate.matches(message)
            if require_all and not matched:
                return False
            if accept_any and matched:
                return True
        # Nothing short-circuited: AND succeeded, anything else did not.
        return require_all
-
# --- Module-level state shared by the functions below ---

# Directory where the search for zip archives starts.
start_path = '.'
# Folder that receives matching messages. Only the generated name is kept;
# the TemporaryDirectory object itself is not retained, and search_content()
# creates the folder when the first match is written.
output_path = TemporaryDirectory(prefix='Email search results (id ', suffix=')').name
# Active message filter; stays None until parse_arguments() builds one.
filter = None
# Whether keyword matching distinguishes upper and lower case.
case_sensitive = False
# Matches written across all archives.
result_count = 0
# Matches written for the archive currently being processed.
zip_result_count = 0
# Number of zip archives encountered.
zip_count = 0
-
def clean_filename(original: str) -> str:
    """Strip every character that is not allowed in a generated filename.

    Letters, digits, spaces and the punctuation ``. ! , ( ) [ ] _ -`` are
    kept; everything else is removed.
    """
    unsafe_run = re.compile(r'[^a-zA-Z0-9 \.!,\(\)\[\]_-]+')
    return unsafe_run.sub('', original)
-
def filename_from_email(email: EmailMessage) -> str:
    """Creates a safe filename to save the given email to.

    The name has the form ``<timestamp> - <subject>.eml``. The timestamp
    comes from the ``Date`` header (``YYYY-MM-DDTHH.MM.SS``) and is all
    zeros when the header is missing or cannot be parsed. The subject is
    scrubbed with ``clean_filename`` and cut to 50 characters, or is
    ``(no subject)`` when the header is absent.
    """
    date_header = email.get('Date')
    # Header values are not guaranteed to be plain str (the parser can hand
    # back header objects), so coerce before handing them to string APIs.
    parsed_date = parsedate(str(date_header)) if date_header is not None else None
    if parsed_date is not None:
        year, month, day, hour, minute, second = parsed_date[:6]
        stamp = f'{year:04}-{month:02}-{day:02}T{hour:02}.{minute:02}.{second:02}'
    else:
        stamp = '0000-00-00T00.00.00'

    subject = email.get('Subject')
    if subject is not None:
        title = clean_filename(str(subject))[0:50].strip()
    else:
        title = '(no subject)'
    return f'{stamp} - {title}.eml'
-
def walk_directory(dir: str) -> None:
    """Spiders a directory looking for subdirectories and email zip archives.

    Every subdirectory is searched recursively. Every other entry whose
    name ends in ``.zip`` (case-insensitive) is counted in ``zip_count``
    and passed to ``process_zip_file``.
    """
    global zip_count
    for name in os.listdir(dir):
        full_path = os.path.join(dir, name)
        # Test the joined path: the bare entry name would be resolved
        # against the current working directory, not against *dir*.
        if os.path.isdir(full_path):
            walk_directory(full_path)
        elif name.lower().endswith('.zip'):
            zip_count += 1
            process_zip_file(full_path)
-
def process_zip_file(zip_path: str) -> None:
    """Processes a zip file of email messages.

    Each non-directory entry is read, parsed as an email and handed to
    ``search_content``. A failure on one entry is reported and the
    remaining entries are still processed; a file that is not a valid zip
    archive is reported and skipped. ``zip_result_count`` is reset for the
    archive and its final value is printed when it is non-zero.
    """
    global zip_result_count
    print('Searching ' + zip_path + '...')
    zip_result_count = 0
    try:
        with ZipFile(zip_path, mode='r') as archive:
            for entry in archive.infolist():
                if entry.is_dir():
                    continue
                try:
                    # Reading sits inside the try so that one damaged entry
                    # does not abort the rest of the archive.
                    data = archive.read(entry)
                    email = BytesParser().parsebytes(data)
                    search_content(email, zip_path, entry)
                except UnicodeError:
                    print('Unicode error in message. Skipping.')
                except Exception:
                    # Not a bare except: Ctrl-C and SystemExit must still
                    # be able to stop the run.
                    print('Error reading message')
    except BadZipFile:
        print('Not a valid zip file. Skipping.')
    if zip_result_count > 0:
        print(f"\t{zip_result_count} results in zip")
-
def search_content(email: EmailMessage, zip_path: str, entry: ZipInfo) -> None:
    """Processes an email message in a zip file.

    When the active filter matches, the message is written to the output
    folder under a name derived from its date and subject, and both result
    counters are incremented. ``zip_path`` and ``entry`` are accepted for
    the caller's benefit but are not used here.
    """
    global result_count, zip_result_count
    if not filter.matches(email):
        return

    # Serialize first so that a failure leaves no empty file behind.
    payload = email.as_bytes()
    os.makedirs(output_path, exist_ok=True)

    filename = filename_from_email(email)
    stem, extension = os.path.splitext(filename)
    target = os.path.join(output_path, filename)
    # Different messages can share the same date and subject; number the
    # later ones instead of overwriting an earlier result.
    duplicate = 1
    while os.path.exists(target):
        duplicate += 1
        target = os.path.join(output_path, f'{stem} ({duplicate}){extension}')

    with open(target, 'wb') as f:
        f.write(payload)
    # Count only messages that were actually written.
    result_count += 1
    zip_result_count += 1
-
def parse_arguments():
    """Parses the command-line arguments.

    Recognized options:
        -d PATH   directory to start searching in (sets ``start_path``)
        -o PATH   folder for the results (sets ``output_path``)
        -c        match keywords case-sensitively

    The single positional argument is the query. It is split on spaces and
    every word must be found (AND) for a message to match. A second query
    prints 'Too many arguments' and exits with status 4.

    Raises:
        ValueError: if an unknown option (anything else starting with
            ``-``) is given.
    """
    global filter
    global start_path
    global output_path
    global case_sensitive
    keywords = []
    expect = None  # name of the option still waiting for its value
    for arg in sys.argv[1:]:
        if arg.startswith('-'):
            if arg == '-d':
                expect = 'start_path'
            elif arg == '-o':
                expect = 'output_path'
            elif arg == '-c':
                case_sensitive = True
            else:
                raise ValueError(f'Unknown argument {arg}')
        elif expect == 'start_path':
            start_path = arg
            expect = None
        elif expect == 'output_path':
            output_path = arg
            expect = None
        elif filter is None and not keywords:
            # An all-blank query leaves keywords empty, so a later
            # argument may still supply the query.
            keywords = [word.strip() for word in arg.split(' ') if word.strip()]
        else:
            print('Too many arguments')
            sys.exit(4)
    if keywords:
        # Build the filter only after every option has been seen, so that
        # -c takes effect wherever it appears on the command line.
        word_filters = [BodyKeywordFilter(word, case_sensitive) for word in keywords]
        filter = BooleanFilter(BooleanOperator.and_op, word_filters)
-
def validate_arguments():
    """Stops the program with exit status 3 when no filter has been set."""
    if filter is not None:
        return
    print('No filter specified')
    sys.exit(3)
-
def handle_results():
    """Final logic after all searching is completed.

    With at least one result, the output folder is opened in the system
    file browser on macOS ('Darwin') and Windows and the total is printed.
    Otherwise a message is printed and the program exits with status 2 if
    no zip files were found, or status 1 if nothing matched.
    """
    if result_count > 0:
        openers = {'Darwin': 'open', 'Windows': 'explorer.exe'}
        opener = openers.get(platform.system())
        if opener is not None:
            subprocess.call([opener, output_path])
        print(f'Found {result_count} result(s) total')
        return

    if zip_count == 0:
        message, status = 'No zip files found', 2
    else:
        message, status = 'No results', 1
    print(message)
    sys.exit(status)
-
# Main logic: read the command line, make sure a query was given, search
# every archive below the start directory, then report the outcome.
# Guarded so that importing this module does not parse sys.argv or exit.
if __name__ == '__main__':
    parse_arguments()
    validate_arguments()
    walk_directory(start_path)
    handle_results()
|