Search utility for my zipped email archives
Вы не можете выбрать более 25 тем Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220
  1. import re
  2. import platform
  3. import subprocess
  4. from tempfile import TemporaryDirectory
  5. from typing import List, Union
  6. from enum import Enum
  7. from email.utils import parsedate
  8. from email.parser import BytesParser
  9. from email.message import EmailMessage
  10. from zipfile import ZipFile, ZipInfo
  11. import sys
  12. import os
  class BooleanOperator(Enum):
      """Selects how a BooleanFilter combines the verdicts of its subfilters."""

      # Every subfilter has to match.
      and_op = 1
      # One matching subfilter is enough.
      or_op = 2
  class Filter:
      """Base class for message filters.

      Subclasses override matches() to decide whether a message is a hit.
      """

      def matches(self, message: EmailMessage) -> bool:
          """Return True when *message* satisfies this filter.

          Raises:
              NotImplementedError: always, in this base class.
          """
          # Raising a bare string is itself a TypeError in Python 3
          # ("exceptions must derive from BaseException"), so use the
          # conventional exception for an un-overridden abstract method.
          raise NotImplementedError("Not implemented")
  class BodyKeywordFilter(Filter):
      """Simple substring search over the text parts of a message.

      Each part whose main content type is ``text`` is searched twice: once
      in its raw serialised form (part headers plus transfer-encoded body)
      and once in its decoded body, so that base64 / quoted-printable
      encoded text can be found as well.
      """

      def __init__(self, keyword: str, case_sensitive: bool = False):
          """Store the keyword to look for and the case-sensitivity flag."""
          self.keyword: str = keyword
          self.case_sensitive: bool = case_sensitive

      @staticmethod
      def _decoded_text(part) -> str:
          """Return the transfer-decoded body of *part* as text ('' if none)."""
          payload = part.get_payload(decode=True)
          if not payload:
              return ''
          charset = part.get_content_charset() or 'utf-8'
          try:
              return payload.decode(charset, errors='replace')
          except LookupError:
              # The message declares a charset Python does not know.
              return payload.decode('utf-8', errors='replace')

      def _contains(self, haystack: str, needle: str) -> bool:
          """Substring test honouring the case-sensitivity setting."""
          if not self.case_sensitive:
              haystack = haystack.lower()
          return needle in haystack

      def matches(self, message: EmailMessage) -> bool:
          """Return True when the keyword occurs in any text part of *message*."""
          needle = self.keyword if self.case_sensitive else self.keyword.lower()
          for part in message.walk():
              if part.get_content_maintype() != 'text':
                  continue
              # Raw form first: this is the search the filter always did.
              if self._contains(part.as_string(), needle):
                  return True
              # The raw form still carries the transfer encoding, so an
              # encoded body would never match without this second look.
              if self._contains(self._decoded_text(part), needle):
                  return True
          return False
  class HeaderFilter(Filter):
      """Matches a value in an email header. Can search one header or multiple.

      The value comparison is a case-insensitive substring test. Every
      occurrence of each named header is examined.
      """

      def __init__(self, headers: Union[str, List[str]], value: str):
          """Accept a single header name or a list of names, plus the value."""
          self.headers: List[str] = [headers] if isinstance(headers, str) else headers
          self.value = value

      def matches(self, message: EmailMessage) -> bool:
          """Return True when any listed header of *message* contains the value."""
          needle = self.value.lower()
          for header in self.headers:
              # get_all() yields every occurrence of a repeated header
              # rather than only the first one.
              for val in message.get_all(header, []):
                  # A header value may come back as an email.header.Header
                  # object rather than str; str() makes .lower() safe.
                  if needle in str(val).lower():
                      return True
          return False
  class BooleanFilter(Filter):
      """Composite filter: folds its subfilters together with AND or OR."""

      def __init__(self, operator: BooleanOperator, subfilters: list):
          self.operator = operator
          self.subfilters: List[Filter] = subfilters

      def matches(self, message: EmailMessage) -> bool:
          """Evaluate the subfilters in order, stopping as soon as the outcome is known."""
          # Lazy stream of verdicts, so all()/any() short-circuit.
          verdicts = (child.matches(message) for child in self.subfilters)
          if self.operator == BooleanOperator.and_op:
              # True for an empty list; stops at the first miss.
              return all(verdicts)
          if self.operator == BooleanOperator.or_op:
              # False for an empty list; stops at the first hit.
              return any(verdicts)
          # Any other operator: every subfilter is still evaluated, then
          # the message is reported as not matching.
          for _ in verdicts:
              pass
          return False
  # --- Run-time configuration (overwritten by parse_arguments) ---

  # Directory where the recursive search for zip archives begins.
  start_path = '.'

  # Default output folder: a unique name inside the system temp directory.
  # Only the generated *name* is wanted here; the folder itself is created
  # lazily when the first result is saved. Cleaning up explicitly (instead
  # of dropping the TemporaryDirectory object) makes the removal happen now,
  # not whenever the garbage collector runs its finalizer, which could
  # otherwise fire after results have been written into the folder.
  _output_dir = TemporaryDirectory(prefix='Email search results (id ', suffix=')')
  output_path = _output_dir.name
  _output_dir.cleanup()
  del _output_dir

  # Active message filter; None until a search query has been parsed.
  filter = None
  # Whether keyword matching distinguishes upper and lower case.
  case_sensitive = False

  # --- Counters ---

  # Number of matching messages saved across all archives.
  result_count = 0
  # Number of matching messages in the archive currently being processed.
  zip_result_count = 0
  # Number of zip archives encountered.
  zip_count = 0
  def clean_filename(original: str) -> str:
      """Strip every character that is not on the filename allow-list.

      Kept: ASCII letters and digits, space, and ``. ! , ( ) [ ] _ -``.
      """
      disallowed_runs = re.compile(r'[^a-zA-Z0-9 \.!,\(\)\[\]_-]+')
      return disallowed_runs.sub('', original)
  def filename_from_email(email: EmailMessage) -> str:
      """Creates a safe filename to save the given email to.

      The name has the form ``<timestamp> - <subject>.eml``. The timestamp
      comes from the Date header (``YYYY-MM-DDTHH.MM.SS``) and falls back to
      an all-zero placeholder when the header is missing or unparsable. The
      subject is scrubbed with clean_filename(), cut to 50 characters and
      stripped; ``(no subject)`` is used when there is no Subject header.
      """
      timestamp = '0000-00-00T00.00.00'
      date_value = email.get('Date', None)
      if date_value is not None:
          # Header values may be email.header.Header objects rather than
          # str; normalise before handing them to string-only helpers.
          parsed_date = parsedate(str(date_value))
          if parsed_date is not None:
              year, month, day, hour, minute, second = parsed_date[:6]
              timestamp = (f'{year:04}-{month:02}-{day:02}'
                           f'T{hour:02}.{minute:02}.{second:02}')

      subject = email.get('Subject')
      if subject is not None:
          title = clean_filename(str(subject))[0:50].strip()
      else:
          title = '(no subject)'

      return f'{timestamp} - {title}.eml'
  def walk_directory(dir: str) -> None:
      """Spiders a directory looking for subdirectories and email zip archives.

      Every entry whose name ends in ``.zip`` (case-insensitive) is counted
      in the global ``zip_count`` and passed to process_zip_file();
      subdirectories are searched recursively. Entries are visited in
      sorted order so that runs are reproducible.
      """
      global zip_count
      for name in sorted(os.listdir(dir)):
          full_path = os.path.join(dir, name)
          # Test the full path: the bare name would be resolved against the
          # current working directory, so nested folders were never entered.
          if os.path.isdir(full_path):
              walk_directory(full_path)
          elif name.lower().endswith('.zip'):
              zip_count += 1
              process_zip_file(full_path)
  def process_zip_file(zip_path: str) -> None:
      """Processes a zip file of email messages.

      Each non-directory entry is parsed as an email and handed to
      search_content(). A message that cannot be read or parsed is reported
      and skipped; an archive that cannot be opened at all is reported and
      skipped as well, so one bad file does not abort the whole search.
      The global ``zip_result_count`` is reset for this archive.
      """
      global zip_result_count
      # Local import: this name is not part of the file's import list.
      from zipfile import BadZipFile

      print('Searching ' + zip_path + '...')
      zip_result_count = 0
      parser = BytesParser()
      try:
          with ZipFile(zip_path, mode='r') as archive:
              for entry in archive.infolist():
                  if entry.is_dir():
                      continue
                  try:
                      email = parser.parsebytes(archive.read(entry))
                      search_content(email, zip_path, entry)
                  except UnicodeError:
                      print('Unicode error in message. Skipping.')
                  except Exception:
                      # Not a bare except: Ctrl-C and sys.exit() must still
                      # be able to stop the program.
                      print('Error reading message')
      except (BadZipFile, OSError):
          print('Not a readable zip archive. Skipping.')
      if zip_result_count > 0:
          print(f"\t{zip_result_count} results in zip")
  def search_content(email: EmailMessage, zip_path: str, entry: ZipInfo) -> None:
      """Processes an email message in a zip file.

      When the active global ``filter`` matches, the message is written as
      an ``.eml`` file into ``output_path`` (created on demand) and both
      result counters are incremented. ``zip_path`` and ``entry`` identify
      where the message came from; they are currently not used.
      """
      global result_count, zip_result_count
      if not filter.matches(email):
          return

      os.makedirs(output_path, exist_ok=True)

      # Messages sharing date and subject map to the same filename; add a
      # numeric suffix instead of silently overwriting an earlier result.
      filename = filename_from_email(email)
      stem, extension = os.path.splitext(filename)
      target = os.path.join(output_path, filename)
      copy_number = 1
      while os.path.exists(target):
          copy_number += 1
          target = os.path.join(output_path, f'{stem} ({copy_number}){extension}')

      # Serialise before opening so a failure leaves no empty file behind,
      # and count the result only once it has actually been saved.
      data = email.as_bytes()
      with open(target, 'wb') as f:
          f.write(data)
      result_count += 1
      zip_result_count += 1
  def parse_arguments():
      """Parses the command-line arguments.

      Recognised arguments (``sys.argv[0]`` is skipped):
        -d PATH   directory to start searching in (sets ``start_path``)
        -o PATH   directory to save results to (sets ``output_path``)
        -c        make keyword matching case sensitive
        QUERY     space-separated keywords that must all occur in a message

      The global ``filter`` is built after all arguments have been read, so
      ``-c`` takes effect wherever it appears on the command line.

      Raises:
          ValueError: for an unrecognised ``-x`` style option.
      Exits with status 4 when more than one query is given.
      """
      global filter
      global start_path
      global output_path
      global case_sensitive

      expect = None      # which option value, if any, the next argument supplies
      keywords = []      # words of the search query, once one has been seen
      for arg in sys.argv[1:]:
          if arg.startswith('-'):
              if arg == '-d':
                  expect = 'start_path'
              elif arg == '-o':
                  expect = 'output_path'
              elif arg == '-c':
                  case_sensitive = True
              else:
                  # A plain string cannot be raised; use a real exception.
                  raise ValueError(f'Unknown argument {arg}')
          elif expect == 'start_path':
              start_path = arg
              expect = None
          elif expect == 'output_path':
              output_path = arg
              expect = None
          elif filter is not None or keywords:
              print('Too many arguments')
              sys.exit(4)
          else:
              # A blank query leaves `keywords` empty, so a later argument
              # may still provide the query.
              keywords = [word.strip() for word in arg.split(' ') if word.strip()]

      if keywords:
          word_filters = [BodyKeywordFilter(word, case_sensitive) for word in keywords]
          filter = BooleanFilter(BooleanOperator.and_op, word_filters)
  def validate_arguments():
      """Stop the program with exit status 3 when no search filter has been set."""
      if filter is not None:
          return
      print('No filter specified')
      sys.exit(3)
  def handle_results():
      """Final logic after all searching is completed.

      With results: opens the output folder in the file manager on macOS
      and Windows, then prints the total and where the files were saved.
      Without results: exits with status 2 when no zip file was found at
      all, otherwise with status 1.
      """
      if result_count > 0:
          openers = {'Darwin': 'open', 'Windows': 'explorer.exe'}
          opener = openers.get(platform.system())
          if opener is not None:
              try:
                  subprocess.call([opener, output_path])
              except OSError:
                  # Opening the folder is a convenience only; the path is
                  # printed below either way.
                  pass
          print(f'Found {result_count} result(s) total')
          # The default output folder is a generated temp directory, so
          # without this line there is no way to find it on platforms
          # where no file manager is launched.
          print(f'Results saved to {output_path}')
      elif zip_count == 0:
          print('No zip files found')
          sys.exit(2)
      else:
          print('No results')
          sys.exit(1)
  def main():
      """Run one search: read the options, check them, scan, then report."""
      parse_arguments()
      validate_arguments()
      # start_path is read only now, after parse_arguments() may have changed it.
      walk_directory(start_path)
      handle_results()


  # Main logic
  main()