Search utility for my zipped email archives
Você não pode selecionar mais de 25 tópicos. Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

search.py 13KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416
  1. import argparse
  2. import re
  3. import platform
  4. import subprocess
  5. from tempfile import TemporaryDirectory
  6. from typing import List, Optional, Union
  7. from enum import Enum
  8. from email.utils import parsedate
  9. from email.parser import BytesParser
  10. from email.message import EmailMessage
  11. from zipfile import ZipFile
  12. import sys
  13. import os
  14. class BooleanOperator(Enum):
  15. """Boolean combinatory operator enum."""
  16. and_op = 1
  17. or_op = 2
  18. class Filter:
  19. """Base class for message filters."""
  20. def matches(self, message: EmailMessage) -> bool:
  21. """Returns true if the given email message matches this filter's criteria."""
  22. raise AssertionError("Not implemented")
  23. def matches_raw(self, raw_content: str) -> bool:
  24. """Returns true if the given raw, unparsed email content matches this filter's criteria."""
  25. raise AssertionError("Not implemented")
  26. def supports_raw(self) -> bool:
  27. """Whether this filter supports raw messages at least partially."""
  28. return False
  29. class BodyKeywordFilter(Filter):
  30. """Simple substring search filter."""
  31. def __init__(self, keyword: str, case_sensitive: bool = False):
  32. self.keyword: str = keyword
  33. self.case_sensitive: bool = case_sensitive
  34. def matches(self, message: EmailMessage) -> bool:
  35. for part in message.walk():
  36. if part.get_content_maintype() == 'text':
  37. if self.case_sensitive:
  38. if self.keyword in part.as_string():
  39. return True
  40. else:
  41. if self.keyword.lower() in part.as_string().lower():
  42. return True
  43. return False
  44. def matches_raw(self, raw_content: str) -> bool:
  45. if self.case_sensitive:
  46. if self.keyword in raw_content:
  47. return True
  48. else:
  49. if self.keyword.lower() in raw_content.lower():
  50. return True
  51. return False
  52. def supports_raw(self) -> bool:
  53. return True
  54. class HeaderFilter(Filter):
  55. """Matches a value in an email header. Can search one filter or multiple.
  56. Header names case-insensitive; value is case-insensitive."""
  57. def __init__(self, headers: Union[str, List[str]], value: str):
  58. self.headers: List[str] = [headers] if isinstance(headers, str) else headers
  59. self.value = value
  60. def matches(self, message: EmailMessage) -> bool:
  61. for header in self.headers:
  62. val = message.get(header, None)
  63. if val is None:
  64. continue
  65. if self.value.lower() in val.lower():
  66. return True
  67. return False
  68. class BooleanFilter(Filter):
  69. """Combines other filters with OR/AND logic."""
  70. def __init__(self, operator: BooleanOperator, subfilters: list):
  71. self.operator = operator
  72. self.subfilters: List[Filter] = subfilters
  73. def matches(self, message: EmailMessage) -> bool:
  74. for subfilter in self.subfilters:
  75. result = subfilter.matches(message)
  76. if self.operator == BooleanOperator.and_op and not result:
  77. return False
  78. if self.operator == BooleanOperator.or_op and result:
  79. return True
  80. if self.operator == BooleanOperator.and_op:
  81. return True
  82. return False
  83. def matches_raw(self, raw_content: str) -> bool:
  84. for subfilter in self.subfilters:
  85. result = subfilter.matches_raw(raw_content)
  86. if self.operator == BooleanOperator.and_op and not result:
  87. return False
  88. if self.operator == BooleanOperator.or_op and result:
  89. return True
  90. if self.operator == BooleanOperator.and_op:
  91. return True
  92. return False
  93. def supports_raw(self) -> bool:
  94. for subfilter in self.subfilters:
  95. if subfilter.supports_raw():
  96. return True
  97. return False
  98. class DateFilter(Filter):
  99. """Filters messages based on the date field. For each message with a parseable
  100. date field, the given comparator is called with a `maketime` list representation
  101. of the date and time. The comparator must return a bool of whether to match
  102. the given date or not."""
  103. def __init__(self, comparator):
  104. self.comparator = comparator
  105. def matches(self, message: EmailMessage) -> bool:
  106. date_str = message.get('date', None)
  107. if date_str is None:
  108. return False
  109. date_elems = parsedate(date_str)
  110. if date_elems is None:
  111. return False
  112. return self.comparator(date_elems)
  113. class Options:
  114. """Parsed command-line options."""
  115. def __init__(self):
  116. self.keywords: List[str] = []
  117. self.any: bool = False
  118. self.dir: List[str] = []
  119. self.output: Optional[str] = None
  120. self.casesensitive: bool = False
  121. setattr(self, 'from', None)
  122. setattr(self, 'to', None)
  123. self.subject: Optional[str] = None
  124. self.before: Optional[List[int]] = None
  125. self.after: Optional[List[int]] = None
  126. self.raw: bool = False
  127. args: Options = Options()
  128. message_filter: Filter = None
  129. result_count = 0
  130. zip_result_count = 0
  131. zip_count = 0
  132. parser: argparse.ArgumentParser = None
  133. def compare_dates(a: List[int], b: List[int]) -> int:
  134. """Compares two list representations of `maketime` date-times. Returns -1 if a < b,
  135. 1 if a > b, and 0 if they are equal."""
  136. for i in range(6):
  137. a_elem = a[i] if i < len(a) else -1
  138. b_elem = b[i] if i < len(b) else -1
  139. if a_elem < b_elem:
  140. return -1
  141. if a_elem > b_elem:
  142. return 1
  143. return 0
  144. def clean_filename(original: str) -> str:
  145. """Returns a scrubbed string with safe filename characters."""
  146. return re.sub(r'[^a-zA-Z0-9 \.!,\(\)\[\]_-]+', '', original)
  147. def filename_from_email(email: EmailMessage) -> str:
  148. """Creates a safe filename to save the given email to."""
  149. filename = ''
  150. date_str = email.get('date', None)
  151. if date_str is not None:
  152. parsed_date = parsedate(date_str)
  153. if parsed_date is not None:
  154. filename += f'{parsed_date[0]:04}-{parsed_date[1]:02}-{parsed_date[2]:02}' + \
  155. f'T{parsed_date[3]:02}.{parsed_date[4]:02}.{parsed_date[5]:02}' + \
  156. ' - '
  157. else:
  158. filename += '0000-00-00T00.00.00 - '
  159. else:
  160. filename += '0000-00-00T00.00.00 - '
  161. filename += f'{result_count:04} - '
  162. subject = email.get('subject')
  163. if subject is not None:
  164. filename += clean_filename(subject)[0:50].strip()
  165. else:
  166. filename += '(no subject)'
  167. filename += '.eml'
  168. return filename
  169. def walk_directory(path: str) -> None:
  170. """Spiders a directory looking for subdirectories and email zip archives."""
  171. global zip_count
  172. for f in os.listdir(path):
  173. full_path = path + os.sep + f
  174. if f.lower().endswith('.zip'):
  175. zip_count += 1
  176. process_zip_file(full_path)
  177. if os.path.isdir(full_path):
  178. walk_directory(full_path)
  179. def process_zip_file(zip_path: str) -> None:
  180. """Processes a zip file of email messages."""
  181. global zip_result_count
  182. print('Searching ' + zip_path + '...')
  183. zip_result_count = 0
  184. with ZipFile(zip_path, mode='r') as zip:
  185. for entry in zip.filelist:
  186. if entry.is_dir():
  187. continue
  188. data = zip.read(entry)
  189. parser = BytesParser()
  190. try:
  191. email = parser.parsebytes(data)
  192. search_content(email)
  193. except:
  194. if message_filter.supports_raw():
  195. search_raw_content(data)
  196. else:
  197. print('Message cannot be parsed. Skipping.')
  198. if zip_result_count > 0:
  199. print(f"\t{zip_result_count} results in zip")
  200. def search_content(email: EmailMessage) -> None:
  201. """Processes an email message in a zip file."""
  202. global result_count, zip_result_count
  203. if message_filter.matches(email):
  204. save_message(email)
  205. def search_raw_content(raw_bytes: bytes) -> None:
  206. global result_count, zip_result_count
  207. encodings = [ 'ascii', 'iso-8859-1', 'utf-8' ]
  208. content = None
  209. for encoding in encodings:
  210. try:
  211. content = raw_bytes.decode(encoding)
  212. break
  213. except:
  214. pass
  215. if content is None:
  216. print('Cannot decode email bytes. Skipping message.', file=sys.stderr)
  217. return
  218. print('Could not parse message. Searching raw content.', file=sys.stderr)
  219. if message_filter.matches_raw(content):
  220. save_raw_message(raw_bytes)
  221. def save_message(email: EmailMessage) -> None:
  222. """Saves a matching message to the results directory."""
  223. global result_count, zip_result_count
  224. if not os.path.exists(args.output):
  225. os.makedirs(args.output)
  226. with open(args.output + os.sep + filename_from_email(email), 'wb') as f:
  227. result_count += 1
  228. zip_result_count += 1
  229. f.write(email.as_bytes())
  230. def save_raw_message(content: bytes) -> None:
  231. """Saves an unparseable matching message to the results directory."""
  232. global result_count, zip_result_count
  233. if not os.path.exists(args.output):
  234. os.makedirs(args.output)
  235. filename = f'unparseable-match-{result_count:04}.eml'
  236. with open(args.output + os.sep + filename, 'wb') as f:
  237. result_count += 1
  238. zip_result_count += 1
  239. f.write(content)
  240. def parse_arguments():
  241. """Parses command-line arguments to `args`."""
  242. global args, parser
  243. # TODO: Revisit raw mode and how unparseable emails should be handled
  244. parser = argparse.ArgumentParser(
  245. prog='search.py',
  246. description='Searches a directory of zipped email messages. ' + \
  247. 'Messages are assumed to be stored one per file within the zip files (Maildir format). ' + \
  248. 'Input directories are searched recursively for any zip files contained within.',
  249. epilog='If raw mode is enabled, any messages that cannot be decoded ' + \
  250. 'will be searched as raw text.'
  251. )
  252. parser.add_argument(
  253. 'keywords',
  254. action='append',
  255. nargs='*',
  256. help='one or more phrases to search for in the message body'
  257. )
  258. parser.add_argument(
  259. '--any',
  260. default=False,
  261. action='store_true',
  262. help='matches messages containing any of the given search phrases (default requires ' + \
  263. 'all phrases appear in a message)'
  264. )
  265. parser.add_argument(
  266. '-d', '--dir',
  267. action='append',
  268. help='directory(s) to search for email zip archives (default is working directory)'
  269. )
  270. parser.add_argument(
  271. '-o', '--output',
  272. help='directory to copy matching messages to (default is a temp directory)'
  273. )
  274. parser.add_argument(
  275. '-c', '--casesensitive',
  276. default=False,
  277. action='store_true',
  278. help='search case-sensitively (default is case-insensitive)'
  279. )
  280. parser.add_argument(
  281. '-f', '--from',
  282. metavar='sender-email',
  283. help='email address of sender'
  284. )
  285. parser.add_argument(
  286. '-t', '--to',
  287. metavar='recipient-email',
  288. help='email address of recipient (searches to:, cc:, bcc: fields)'
  289. )
  290. parser.add_argument(
  291. '-s', '--subject',
  292. help='searches subject field'
  293. )
  294. parser.add_argument(
  295. '-a', '--after',
  296. metavar='YYYY-MM-DD',
  297. help='date to search on or after'
  298. )
  299. parser.add_argument(
  300. '-b', '--before',
  301. metavar='YYYY-MM-DD',
  302. help='date to search on or before'
  303. )
  304. parser.add_argument(
  305. '-r', '--raw',
  306. default=False,
  307. action='store_true',
  308. help='allows searching unparseable messages as raw text'
  309. )
  310. args = parser.parse_args()
  311. def validate_arguments():
  312. """Validate and parse special field types"""
  313. global args
  314. args.keywords = args.keywords[0] # no idea why it nests it 2D
  315. if args.before is not None:
  316. m = re.match('^([0-9]{4})-([0-9]{2})-([0-9]{2})$', args.before)
  317. if m is None:
  318. parser.error('before date must be in YYYY-MM-DD format (e.g. 2015-03-28)')
  319. args.before = [ int(m.group(1)), int(m.group(2)), int(m.group(3)) ]
  320. if args.after is not None:
  321. m = re.match('^([0-9]{4})-([0-9]{2})-([0-9]{2})$', args.after)
  322. if m is None:
  323. parser.error('after date must be in YYYY-MM-DD format (e.g. 2015-03-28)')
  324. args.after = [ int(m.group(1)), int(m.group(2)), int(m.group(3)) ]
  325. if args.raw:
  326. if getattr(args, 'from') is not None or \
  327. getattr(args, 'to') is not None or \
  328. args.subject is not None or \
  329. args.before is not None or \
  330. args.after is not None:
  331. print('Warning: Cannot search header fields in raw mode. Ignoring.', file=sys.stderr)
  332. if args.dir is None:
  333. args.dir = [ '.' ]
  334. else:
  335. for d in args.dir:
  336. if not os.path.exists(d) or not os.path.isdir(d):
  337. parser.error(f'search path \'{d}\' does not exist or is not a directory')
  338. if args.output is None:
  339. args.output = TemporaryDirectory(prefix='Email search results (id ', suffix=')').name
  340. else:
  341. if not os.path.exists(args.output) or not os.path.isdir(args.output):
  342. parser.error(f'output path \'{args.output}\' does not exist or is not a directory')
  343. def construct_filter():
  344. global message_filter
  345. criteria: List[Filter] = []
  346. keyword_filters = []
  347. for k in args.keywords:
  348. k = k.strip()
  349. if len(k) > 0:
  350. keyword_filters.append(BodyKeywordFilter(k, case_sensitive=args.casesensitive))
  351. if len(keyword_filters) > 0:
  352. criteria.append(BooleanFilter(BooleanOperator.or_op if args.any else BooleanOperator.and_op, keyword_filters))
  353. if getattr(args, 'from') is not None:
  354. criteria.append(HeaderFilter('from', getattr(args, 'from')))
  355. if getattr(args, 'to') is not None:
  356. criteria.append(HeaderFilter(['to', 'cc', 'bcc'], getattr(args, 'to')))
  357. if args.subject is not None:
  358. criteria.append(HeaderFilter('subject', args.subject))
  359. if args.before is not None:
  360. criteria.append(DateFilter(lambda d: compare_dates(d, args.before) <= 0))
  361. if args.after is not None:
  362. criteria.append(DateFilter(lambda d: compare_dates(d, args.after) >= 0))
  363. if len(criteria) == 0:
  364. parser.error('No filters specified')
  365. message_filter = BooleanFilter(BooleanOperator.and_op, criteria)
  366. def handle_results():
  367. """Final logic after all searching is completed."""
  368. if result_count > 0:
  369. if platform.system() == 'Darwin':
  370. subprocess.call(['open', args.output])
  371. elif platform.system() == 'Windows':
  372. subprocess.call(['explorer.exe', args.output])
  373. print(f'Found {result_count} result(s) total')
  374. elif zip_count == 0:
  375. print('No zip files found')
  376. sys.exit(2)
  377. else:
  378. print('No results')
  379. sys.exit(1)
  380. # Main logic
  381. parse_arguments()
  382. validate_arguments()
  383. construct_filter()
  384. for path in args.dir:
  385. walk_directory(path)
  386. handle_results()