Search utility for my zipped email archives
Du kannst nicht mehr als 25 Themen auswählen Themen müssen mit entweder einem Buchstaben oder einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.

search.py 10KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320
  1. import argparse
  2. import re
  3. import platform
  4. import subprocess
  5. from tempfile import TemporaryDirectory
  6. from typing import List, Union
  7. from enum import Enum
  8. from email.utils import parsedate
  9. from email.parser import BytesParser
  10. from email.message import EmailMessage
  11. from zipfile import ZipFile, ZipInfo
  12. import sys
  13. import os
class BooleanOperator(Enum):
    """Logical connective a BooleanFilter uses to combine its sub-filter results."""

    # Every sub-filter must match.
    and_op = 1
    # At least one sub-filter must match.
    or_op = 2
  17. class Filter:
  18. """Base class for message filters."""
  19. def matches(self, message: EmailMessage) -> bool:
  20. """Returns true if the given email message matches this filter's criteria."""
  21. raise "Not implemented"
  22. def matches_raw(self, raw_content: str) -> bool:
  23. """Returns true if the given raw, unparsed email content matches this filter's criteria."""
  24. raise "Not implemented"
  25. class BodyKeywordFilter(Filter):
  26. """Simple substring search filter."""
  27. def __init__(self, keyword: str, case_sensitive: bool = False):
  28. self.keyword: str = keyword
  29. self.case_sensitive: bool = case_sensitive
  30. def matches(self, message: EmailMessage) -> bool:
  31. for part in message.walk():
  32. if part.get_content_maintype() == 'text':
  33. if self.case_sensitive:
  34. if self.keyword in part.as_string():
  35. return True
  36. else:
  37. if self.keyword.lower() in part.as_string().lower():
  38. return True
  39. return False
  40. def matches_raw(self, raw_content: str) -> bool:
  41. if self.case_sensitive:
  42. if self.keyword in raw_content:
  43. return True
  44. else:
  45. if self.keyword.lower() in raw_content.lower():
  46. return True
  47. return False
  48. class HeaderFilter(Filter):
  49. """Matches a value in an email header. Can search one filter or multiple.
  50. Header names case-insensitive; value is case-insensitive."""
  51. def __init__(self, headers: Union[str, List[str]], value: str):
  52. self.headers: List[str] = [headers] if isinstance(headers, str) else headers
  53. self.value = value
  54. def matches(self, message: EmailMessage) -> bool:
  55. for header in self.headers:
  56. val = message.get(header, None)
  57. if val is None:
  58. continue
  59. if self.value.lower() in val.lower():
  60. return True
  61. return False
  62. class BooleanFilter(Filter):
  63. """Combines other filters with OR/AND logic."""
  64. def __init__(self, operator: BooleanOperator, subfilters: list):
  65. self.operator = operator
  66. self.subfilters: List[Filter] = subfilters
  67. def matches(self, message: EmailMessage) -> bool:
  68. for subfilter in self.subfilters:
  69. result = subfilter.matches(message)
  70. if self.operator == BooleanOperator.and_op and not result:
  71. return False
  72. if self.operator == BooleanOperator.or_op and result:
  73. return True
  74. if self.operator == BooleanOperator.and_op:
  75. return True
  76. return False
  77. def matches_raw(self, raw_content: str) -> bool:
  78. for subfilter in self.subfilters:
  79. result = subfilter.matches_raw(raw_content)
  80. if self.operator == BooleanOperator.and_op and not result:
  81. return False
  82. if self.operator == BooleanOperator.or_op and result:
  83. return True
  84. if self.operator == BooleanOperator.and_op:
  85. return True
  86. return False
  87. args: argparse.Namespace = None
  88. filter = None
  89. result_count = 0
  90. zip_result_count = 0
  91. zip_count = 0
  92. def clean_filename(original: str) -> str:
  93. """Returns a scrubbed string with safe filename characters."""
  94. return re.sub(r'[^a-zA-Z0-9 \.!,\(\)\[\]_-]+', '', original)
  95. def filename_from_email(email: EmailMessage) -> str:
  96. """Creates a safe filename to save the given email to."""
  97. filename = ''
  98. date_str = email.get('Date', None)
  99. if date_str is not None:
  100. parsed_date = parsedate(date_str)
  101. if parsed_date is not None:
  102. filename += f'{parsed_date[0]:04}-{parsed_date[1]:02}-{parsed_date[2]:02}' + \
  103. f'T{parsed_date[3]:02}.{parsed_date[4]:02}.{parsed_date[5]:02}' + \
  104. ' - '
  105. else:
  106. filename += '0000-00-00T00.00.00 - '
  107. else:
  108. filename += '0000-00-00T00.00.00 - '
  109. subject = email.get('Subject')
  110. if subject is not None:
  111. filename += clean_filename(subject)[0:50].strip()
  112. else:
  113. filename += '(no subject)'
  114. filename += '.eml'
  115. return filename
  116. def walk_directory(dir: str) -> None:
  117. """Spiders a directory looking for subdirectories and email zip archives."""
  118. global zip_count
  119. for f in os.listdir(dir):
  120. full_path = dir + os.sep + f
  121. if f.lower().endswith('.zip'):
  122. zip_count += 1
  123. process_zip_file(full_path)
  124. if os.path.isdir(f):
  125. walk_directory(full_path)
  126. def process_zip_file(zip_path: str) -> None:
  127. """Processes a zip file of email messages."""
  128. global zip_result_count
  129. print('Searching ' + zip_path + '...')
  130. zip_result_count = 0
  131. with ZipFile(zip_path, mode='r') as zip:
  132. for entry in zip.filelist:
  133. if entry.is_dir():
  134. continue
  135. data = zip.read(entry)
  136. parser = BytesParser()
  137. try:
  138. email = parser.parsebytes(data)
  139. search_content(email, zip_path, entry)
  140. except UnicodeError:
  141. print('Unicode error in message. Searching raw content.', file=sys.stderr)
  142. except:
  143. print('Error parsing message. Searching raw content.', file=sys.stderr)
  144. if zip_result_count > 0:
  145. print(f"\t{zip_result_count} results in zip")
  146. def search_content(email: EmailMessage, zip_path: str, entry: ZipInfo) -> None:
  147. """Processes an email message in a zip file."""
  148. global result_count, zip_result_count
  149. if filter.matches(email):
  150. if not os.path.exists(output_path):
  151. os.makedirs(output_path)
  152. with open(output_path + os.sep + filename_from_email(email), 'wb') as f:
  153. result_count += 1
  154. zip_result_count += 1
  155. f.write(email.as_bytes())
  156. def search_raw_content(raw_bytes: bytes, zip_path: str, entry: ZipInfo) -> None:
  157. global result_count, zip_result_count
  158. try:
  159. content = raw_bytes.decode('iso-8859-1', errors='ignore')
  160. except:
  161. try:
  162. content = raw_bytes.decode('utf-8', errors='ignore')
  163. except:
  164. print('Cannot decode email bytes. Skipping.', file=sys.stderr)
  165. def parse_arguments():
  166. """Parses command-line arguments to `args`."""
  167. global args
  168. parser = argparse.ArgumentParser(
  169. prog='search.py',
  170. description='Searches a directory of zipped email messages. ' + \
  171. 'Messages are assumed to be stored one per file within the zip files (Maildir format). ' + \
  172. 'Input directories are searched recursively for any zip files contained within.',
  173. epilog='Raw mode will skip parsing each email message and treat them like simple text files. ' + \
  174. 'The headers and body are all searched together without decoding. ' + \
  175. 'Arguments for searching individual fields will be ignored. ' + \
  176. 'This option exists for messages with encoding errors that prevent them from being ' + \
  177. 'parsed correctly. ' + \
  178. 'Note that various escaping/encoding schemes commonly used in email messages, such ' + \
  179. 'as base64, may cause keywords to not be found despite being in the decoded message ' + \
  180. 'because only the raw encoded content is searched. ' + \
  181. 'Use this option as a last resort.'
  182. )
  183. parser.add_argument(
  184. 'keywords',
  185. action='append',
  186. nargs='+',
  187. help='one or more phrases to search for in the message body'
  188. )
  189. parser.add_argument(
  190. '--any',
  191. default=False,
  192. action='store_true',
  193. help='matches messages containing any of the given search phrases (default requires all phrases appear in a message)'
  194. )
  195. parser.add_argument(
  196. '-d', '--dir',
  197. action='append',
  198. help='directory(s) to search for email zip archives (default is working directory)'
  199. )
  200. parser.add_argument(
  201. '-o', '--output',
  202. help='directory to copy matching messages to (default is a temp directory)'
  203. )
  204. parser.add_argument(
  205. '-c', '--casesensitive',
  206. default=False,
  207. action='store_true',
  208. help='search case-sensitively (default is case-insensitive)'
  209. )
  210. parser.add_argument(
  211. '-f', '--from',
  212. help='email address of sender'
  213. )
  214. parser.add_argument(
  215. '-t', '--to',
  216. help='email address of recipient (searches to:, cc:, bcc: fields)'
  217. )
  218. parser.add_argument(
  219. '-s', '--subject',
  220. help='searches subject field'
  221. )
  222. parser.add_argument(
  223. '-a', '--after',
  224. metavar='YYYY-MM-DD',
  225. help='date to search on or after'
  226. )
  227. parser.add_argument(
  228. '-b', '--before',
  229. metavar='YYYY-MM-DD',
  230. help='date to search on or before'
  231. )
  232. parser.add_argument(
  233. '-r', '--raw',
  234. default=False,
  235. action='store_true',
  236. help='searches raw email content (see below)'
  237. )
  238. args = parser.parse_args()
  239. if args.before is not None:
  240. m = re.match('^([0-9]{4})-([0-9]{2})-([0-9]{2})$', args.before)
  241. if m is None:
  242. parser.error('before date must be in YYYY-MM-DD format (e.g. 2015-03-28)')
  243. args.before = [ int(m.group(1)), int(m.group(2)), int(m.group(3)) ]
  244. if args.after is not None:
  245. m = re.match('^([0-9]{4})-([0-9]{2})-([0-9]{2})$', args.after)
  246. if m is None:
  247. parser.error('after date must be in YYYY-MM-DD format (e.g. 2015-03-28)')
  248. args.after = [ int(m.group(1)), int(m.group(2)), int(m.group(3)) ]
  249. if args.raw:
  250. if getattr(args, 'from') is not None or \
  251. getattr(args, 'to') is not None or \
  252. args.subject is not None or \
  253. args.before is not None or \
  254. args.after is not None:
  255. print('Warning: Cannot search header fields in raw mode. Ignoring.', file=sys.stderr)
  256. if args.dir is None:
  257. args.dir = [ '.' ]
  258. else:
  259. for d in args.dir:
  260. if not os.path.exists(d) or not os.path.isdir(d):
  261. parser.error(f'search path \'{d}\' does not exist or is not a directory')
  262. if args.output is None:
  263. args.output = TemporaryDirectory(prefix='Email search results (id ', suffix=')').name
  264. else:
  265. if not os.path.exists(args.output) or not os.path.isdir(args.output):
  266. parser.error(f'output path \'{args.output}\' does not exist or is not a directory')
  267. def construct_filter():
  268. global filter
  269. criteria: List[Filter] = []
  270. keyword_filters = map(lambda k : BodyKeywordFilter(k, case_sensitive=args.casesensitive), args.keywords)
  271. criteria.append(BooleanFilter(BooleanOperator.or_op if args.any else BooleanOperator.and_op, keyword_filters))
  272. if getattr(args, 'from') is not None:
  273. criteria.append(HeaderFilter('from', getattr(args, 'from')))
  274. if getattr(args, 'to') is not None:
  275. criteria.append(HeaderFilter(['to', 'cc', 'bcc'], getattr(args, 'to')))
  276. if args.subject is not None:
  277. criteria.append(HeaderFilter('subject', args.subject))
  278. # TODO: Dates
  279. filter = BooleanFilter(BooleanOperator.and_op, criteria)
  280. def handle_results():
  281. """Final logic after all searching is completed."""
  282. if result_count > 0:
  283. if platform.system() == 'Darwin':
  284. subprocess.call(['open', output_path])
  285. elif platform.system() == 'Windows':
  286. subprocess.call(['explorer.exe', output_path])
  287. print(f'Found {result_count} result(s) total')
  288. elif zip_count == 0:
  289. print('No zip files found')
  290. sys.exit(2)
  291. else:
  292. print('No results')
  293. sys.exit(1)
  294. # Main logic
  295. parse_arguments()
  296. construct_filter()
  297. walk_directory(start_path)
  298. handle_results()