|
|
@@ -3,16 +3,17 @@ import re
|
|
3
|
3
|
import platform
|
|
4
|
4
|
import subprocess
|
|
5
|
5
|
from tempfile import TemporaryDirectory
|
|
6
|
|
-from typing import List, Union
|
|
|
6
|
+from typing import List, Optional, Union
|
|
7
|
7
|
from enum import Enum
|
|
8
|
8
|
from email.utils import parsedate
|
|
9
|
9
|
from email.parser import BytesParser
|
|
10
|
10
|
from email.message import EmailMessage
|
|
11
|
|
-from zipfile import ZipFile, ZipInfo
|
|
|
11
|
+from zipfile import ZipFile
|
|
12
|
12
|
import sys
|
|
13
|
13
|
import os
|
|
14
|
14
|
|
|
15
|
15
|
class BooleanOperator(Enum):
    """Boolean operator used to combine the results of several sub-filters.

    construct_filter() selects `or_op` when the `--any` option is given and
    `and_op` otherwise.
    """
    # Presumably "every sub-filter must match"; confirm against BooleanFilter.
    and_op = 1
    # Presumably "at least one sub-filter must match"; confirm against BooleanFilter.
    or_op = 2
|
|
18
|
19
|
|
|
|
@@ -20,10 +21,13 @@ class Filter:
|
|
20
|
21
|
"""Base class for message filters."""
|
|
21
|
22
|
def matches(self, message: EmailMessage) -> bool:
|
|
22
|
23
|
"""Returns true if the given email message matches this filter's criteria."""
|
|
23
|
|
- raise "Not implemented"
|
|
|
24
|
+ raise AssertionError("Not implemented")
|
|
24
|
25
|
def matches_raw(self, raw_content: str) -> bool:
|
|
25
|
26
|
"""Returns true if the given raw, unparsed email content matches this filter's criteria."""
|
|
26
|
|
- raise "Not implemented"
|
|
|
27
|
+ raise AssertionError("Not implemented")
|
|
|
28
|
+ def supports_raw(self) -> bool:
|
|
|
29
|
+ """Whether this filter supports raw messages at least partially."""
|
|
|
30
|
+ return False
|
|
27
|
31
|
|
|
28
|
32
|
class BodyKeywordFilter(Filter):
|
|
29
|
33
|
"""Simple substring search filter."""
|
|
|
@@ -46,11 +50,14 @@ class BodyKeywordFilter(Filter):
|
|
46
|
50
|
if self.case_sensitive:
|
|
47
|
51
|
if self.keyword in raw_content:
|
|
48
|
52
|
return True
|
|
49
|
|
- else:
|
|
50
|
|
- if self.keyword.lower() in raw_content.lower():
|
|
51
|
|
- return True
|
|
|
53
|
+ else:
|
|
|
54
|
+ if self.keyword.lower() in raw_content.lower():
|
|
|
55
|
+ return True
|
|
52
|
56
|
return False
|
|
53
|
57
|
|
|
|
58
|
+ def supports_raw(self) -> bool:
|
|
|
59
|
+ return True
|
|
|
60
|
+
|
|
54
|
61
|
class HeaderFilter(Filter):
|
|
55
|
62
|
"""Matches a value in an email header. Can search one filter or multiple.
|
|
56
|
63
|
Header names case-insensitive; value is case-insensitive."""
|
|
|
@@ -95,11 +102,62 @@ class BooleanFilter(Filter):
|
|
95
|
102
|
return True
|
|
96
|
103
|
return False
|
|
97
|
104
|
|
|
98
|
|
-args: argparse.Namespace = None
|
|
99
|
|
-filter = None
|
|
|
105
|
+ def supports_raw(self) -> bool:
|
|
|
106
|
+ for subfilter in self.subfilters:
|
|
|
107
|
+ if subfilter.supports_raw():
|
|
|
108
|
+ return True
|
|
|
109
|
+ return False
|
|
|
110
|
+
|
|
|
111
|
class DateFilter(Filter):
    """Message filter driven by the message's `date` header.

    The header value is parsed with `email.utils.parsedate`; when parsing
    succeeds, the parsed result is handed to `comparator`, whose return value
    decides whether the message matches. Messages without a `date` header, or
    with one that cannot be parsed, never match.
    """

    def __init__(self, comparator):
        # Callable that receives the parsed date and returns a bool.
        self.comparator = comparator

    def matches(self, message: EmailMessage) -> bool:
        """Return the comparator's verdict, or False when no usable date exists."""
        raw_date = message.get('date', None)
        # Only attempt to parse when the header is actually present.
        parsed = parsedate(raw_date) if raw_date is not None else None
        if parsed is None:
            return False
        return self.comparator(parsed)
|
|
|
127
|
+
|
|
|
128
|
class Options:
    """Container for the command-line options, pre-populated with defaults."""

    def __init__(self):
        # Phrases to look for in the message body.
        self.keywords: List[str] = []
        # True when matching any single phrase is enough (--any).
        self.any: bool = False
        # Directories to search.
        self.dir: List[str] = []
        # Directory that matching messages are written to.
        self.output: Optional[str] = None
        self.casesensitive: bool = False
        # 'from' is a reserved word, so it cannot be assigned with attribute
        # syntax; 'to' is set the same way to keep the pair together.
        for reserved_name in ('from', 'to'):
            setattr(self, reserved_name, None)
        self.subject: Optional[str] = None
        # Date bounds as lists of ints (see compare_dates).
        self.before: Optional[List[int]] = None
        self.after: Optional[List[int]] = None
        # True when unparseable messages may be searched as raw text (--raw).
        self.raw: bool = False
|
|
|
142
|
+
|
|
|
143
|
+args: Options = Options()
|
|
|
144
|
+message_filter: Filter = None
|
|
100
|
145
|
result_count = 0
|
|
101
|
146
|
zip_result_count = 0
|
|
102
|
147
|
zip_count = 0
|
|
|
148
|
+parser: argparse.ArgumentParser = None
|
|
|
149
|
+
|
|
|
150
|
def compare_dates(a: List[int], b: List[int]) -> int:
    """Order two date-time sequences by their first six elements.

    Only positions 0-5 are considered. A sequence shorter than six elements
    is treated as if the missing positions held -1, so a shorter date sorts
    before any longer date that shares its prefix.

    Returns -1 if a < b, 1 if a > b, and 0 if they are equal.
    """
    def first_six(parts):
        # Truncate to six fields, then pad the missing ones with -1.
        head = tuple(parts[:6])
        return head + (-1,) * (6 - len(head))

    key_a, key_b = first_six(a), first_six(b)
    # Tuples compare element by element, left to right.
    if key_a < key_b:
        return -1
    if key_a > key_b:
        return 1
    return 0
|
|
103
|
161
|
|
|
104
|
162
|
def clean_filename(original: str) -> str:
|
|
105
|
163
|
"""Returns a scrubbed string with safe filename characters."""
|
|
|
@@ -108,7 +166,7 @@ def clean_filename(original: str) -> str:
|
|
108
|
166
|
def filename_from_email(email: EmailMessage) -> str:
|
|
109
|
167
|
"""Creates a safe filename to save the given email to."""
|
|
110
|
168
|
filename = ''
|
|
111
|
|
- date_str = email.get('Date', None)
|
|
|
169
|
+ date_str = email.get('date', None)
|
|
112
|
170
|
if date_str is not None:
|
|
113
|
171
|
parsed_date = parsedate(date_str)
|
|
114
|
172
|
if parsed_date is not None:
|
|
|
@@ -119,7 +177,8 @@ def filename_from_email(email: EmailMessage) -> str:
|
|
119
|
177
|
filename += '0000-00-00T00.00.00 - '
|
|
120
|
178
|
else:
|
|
121
|
179
|
filename += '0000-00-00T00.00.00 - '
|
|
122
|
|
- subject = email.get('Subject')
|
|
|
180
|
+ filename += f'{result_count:04} - '
|
|
|
181
|
+ subject = email.get('subject')
|
|
123
|
182
|
if subject is not None:
|
|
124
|
183
|
filename += clean_filename(subject)[0:50].strip()
|
|
125
|
184
|
else:
|
|
|
@@ -127,15 +186,15 @@ def filename_from_email(email: EmailMessage) -> str:
|
|
127
|
186
|
filename += '.eml'
|
|
128
|
187
|
return filename
|
|
129
|
188
|
|
|
130
|
|
-def walk_directory(dir: str) -> None:
|
|
|
189
|
def walk_directory(path: str) -> None:
    """Recursively scan `path` for zip archives and process each one found.

    Every entry whose name ends in `.zip` (case-insensitive) is counted in the
    global `zip_count` and passed to process_zip_file(). Every entry that is a
    directory is walked in turn.
    """
    global zip_count
    for entry in os.listdir(path):
        entry_path = os.sep.join((path, entry))
        is_zip = entry.lower().endswith('.zip')
        if is_zip:
            zip_count += 1
            process_zip_file(entry_path)
        # Checked independently of the zip test, so a directory whose name
        # ends in .zip is also descended into.
        if os.path.isdir(entry_path):
            walk_directory(entry_path)
|
|
140
|
199
|
|
|
141
|
200
|
def process_zip_file(zip_path: str) -> None:
|
|
|
@@ -151,64 +210,83 @@ def process_zip_file(zip_path: str) -> None:
|
|
151
|
210
|
parser = BytesParser()
|
|
152
|
211
|
try:
|
|
153
|
212
|
email = parser.parsebytes(data)
|
|
154
|
|
- search_content(email, zip_path, entry)
|
|
155
|
|
- except UnicodeError:
|
|
156
|
|
- print('Unicode error in message. Searching raw content.', file=sys.stderr)
|
|
|
213
|
+ search_content(email)
|
|
157
|
214
|
except:
|
|
158
|
|
- print('Error parsing message. Searching raw content.', file=sys.stderr)
|
|
|
215
|
+ if message_filter.supports_raw():
|
|
|
216
|
+ search_raw_content(data)
|
|
|
217
|
+ else:
|
|
|
218
|
+ print('Message cannot be parsed. Skipping.')
|
|
159
|
219
|
if zip_result_count > 0:
|
|
160
|
220
|
print(f"\t{zip_result_count} results in zip")
|
|
161
|
221
|
|
|
162
|
|
-def search_content(email: EmailMessage, zip_path: str, entry: ZipInfo) -> None:
|
|
|
222
|
def search_content(email: EmailMessage) -> None:
    """Save a parsed email message if it satisfies the active message filter."""
    # Nothing to do for messages the filter rejects.
    if not message_filter.matches(email):
        return
    save_message(email)
|
|
172
|
227
|
|
|
173
|
|
-def search_raw_content(raw_bytes: bytes, zip_path: str, entry: ZipInfo) -> None:
|
|
|
228
|
def search_raw_content(raw_bytes: bytes) -> None:
    """Search an unparseable message as plain text and save it on a match.

    The bytes are decoded with the first codec that accepts them, then handed
    to the active message filter's `matches_raw`. A matching message is
    written out unchanged via save_raw_message().

    Args:
        raw_bytes: The undecoded content of a single message.
    """
    # Strict codec first. iso-8859-1 maps every possible byte value, so it
    # must come last: placed earlier it would shadow utf-8 and turn UTF-8 text
    # into mojibake that non-ASCII keywords can never match. A separate
    # 'ascii' attempt is unnecessary because ASCII is a subset of UTF-8.
    content = None
    for encoding in ('utf-8', 'iso-8859-1'):
        try:
            content = raw_bytes.decode(encoding)
        except UnicodeDecodeError:
            # Only decoding failures mean "try the next codec"; anything else
            # is a real error and is allowed to propagate.
            continue
        break
    if content is None:
        # Unreachable while the catch-all iso-8859-1 fallback is in the list;
        # kept so that removing the fallback still skips the message cleanly.
        print('Cannot decode email bytes. Skipping message.', file=sys.stderr)
        return
    print('Could not parse message. Searching raw content.', file=sys.stderr)
    if message_filter.matches_raw(content):
        save_raw_message(raw_bytes)
|
|
|
244
|
+
|
|
|
245
|
def save_message(email: EmailMessage) -> None:
    """Write a matching parsed message into the output directory.

    The output directory (`args.output`) is created if it does not exist yet.
    The file name comes from filename_from_email(). Both result counters are
    incremented once the output file has been opened.
    """
    global result_count, zip_result_count
    out_dir = args.output
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    # The name is computed before the counters change.
    target = out_dir + os.sep + filename_from_email(email)
    with open(target, 'wb') as out_file:
        result_count += 1
        zip_result_count += 1
        out_file.write(email.as_bytes())
|
|
|
254
|
+
|
|
|
255
|
def save_raw_message(content: bytes) -> None:
    """Write an unparseable matching message, byte for byte, to the output directory.

    The output directory (`args.output`) is created if it does not exist yet.
    The file is named `unparseable-match-NNNN.eml`, where NNNN is the value of
    `result_count` before this message is counted.
    """
    global result_count, zip_result_count
    out_dir = args.output
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    # Name the file from the counter's current value, before it is incremented.
    target = out_dir + os.sep + f'unparseable-match-{result_count:04}.eml'
    with open(target, 'wb') as out_file:
        result_count += 1
        zip_result_count += 1
        out_file.write(content)
|
|
182
|
265
|
|
|
183
|
266
|
def parse_arguments():
|
|
184
|
267
|
"""Parses command-line arguments to `args`."""
|
|
185
|
|
- global args
|
|
|
268
|
+ global args, parser
|
|
|
269
|
+ # TODO: Revisit raw mode and how unparseable emails should be handled
|
|
186
|
270
|
parser = argparse.ArgumentParser(
|
|
187
|
271
|
prog='search.py',
|
|
188
|
272
|
description='Searches a directory of zipped email messages. ' + \
|
|
189
|
273
|
'Messages are assumed to be stored one per file within the zip files (Maildir format). ' + \
|
|
190
|
274
|
'Input directories are searched recursively for any zip files contained within.',
|
|
191
|
|
- epilog='Raw mode will skip parsing each email message and treat them like simple text files. ' + \
|
|
192
|
|
- 'The headers and body are all searched together without decoding. ' + \
|
|
193
|
|
- 'Arguments for searching individual fields will be ignored. ' + \
|
|
194
|
|
- 'This option exists for messages with encoding errors that prevent them from being ' + \
|
|
195
|
|
- 'parsed correctly. ' + \
|
|
196
|
|
- 'Note that various escaping/encoding schemes commonly used in email messages, such ' + \
|
|
197
|
|
- 'as base64, may cause keywords to not be found despite being in the decoded message ' + \
|
|
198
|
|
- 'because only the raw encoded content is searched. ' + \
|
|
199
|
|
- 'Use this option as a last resort.'
|
|
|
275
|
+ epilog='If raw mode is enabled, any messages that cannot be decoded ' + \
|
|
|
276
|
+ 'will be searched as raw text.'
|
|
200
|
277
|
)
|
|
201
|
278
|
parser.add_argument(
|
|
202
|
279
|
'keywords',
|
|
203
|
280
|
action='append',
|
|
204
|
|
- nargs='+',
|
|
|
281
|
+ nargs='*',
|
|
205
|
282
|
help='one or more phrases to search for in the message body'
|
|
206
|
283
|
)
|
|
207
|
284
|
parser.add_argument(
|
|
208
|
285
|
'--any',
|
|
209
|
286
|
default=False,
|
|
210
|
287
|
action='store_true',
|
|
211
|
|
- help='matches messages containing any of the given search phrases (default requires all phrases appear in a message)'
|
|
|
288
|
+ help='matches messages containing any of the given search phrases (default requires ' + \
|
|
|
289
|
+ 'all phrases appear in a message)'
|
|
212
|
290
|
)
|
|
213
|
291
|
parser.add_argument(
|
|
214
|
292
|
'-d', '--dir',
|
|
|
@@ -227,10 +305,12 @@ def parse_arguments():
|
|
227
|
305
|
)
|
|
228
|
306
|
parser.add_argument(
|
|
229
|
307
|
'-f', '--from',
|
|
|
308
|
+ metavar='sender-email',
|
|
230
|
309
|
help='email address of sender'
|
|
231
|
310
|
)
|
|
232
|
311
|
parser.add_argument(
|
|
233
|
312
|
'-t', '--to',
|
|
|
313
|
+ metavar='recipient-email',
|
|
234
|
314
|
help='email address of recipient (searches to:, cc:, bcc: fields)'
|
|
235
|
315
|
)
|
|
236
|
316
|
parser.add_argument(
|
|
|
@@ -251,10 +331,14 @@ def parse_arguments():
|
|
251
|
331
|
'-r', '--raw',
|
|
252
|
332
|
default=False,
|
|
253
|
333
|
action='store_true',
|
|
254
|
|
- help='searches raw email content (see below)'
|
|
|
334
|
+ help='allows searching unparseable messages as raw text'
|
|
255
|
335
|
)
|
|
256
|
336
|
args = parser.parse_args()
|
|
257
|
337
|
|
|
|
338
|
+def validate_arguments():
|
|
|
339
|
+ """Validate and parse special field types"""
|
|
|
340
|
+ global args
|
|
|
341
|
+ args.keywords = args.keywords[0] # no idea why it nests it 2D
|
|
258
|
342
|
if args.before is not None:
|
|
259
|
343
|
m = re.match('^([0-9]{4})-([0-9]{2})-([0-9]{2})$', args.before)
|
|
260
|
344
|
if m is None:
|
|
|
@@ -285,26 +369,36 @@ def parse_arguments():
|
|
285
|
369
|
parser.error(f'output path \'{args.output}\' does not exist or is not a directory')
|
|
286
|
370
|
|
|
287
|
371
|
def construct_filter():
    """Build the global `message_filter` from the parsed command-line options.

    Each option that was supplied contributes one criterion, and all criteria
    are combined with `BooleanOperator.and_op`. Calls `parser.error` when no
    criterion was supplied at all.
    """
    global message_filter
    criteria: List[Filter] = []

    # Body keywords: whitespace is stripped and empty phrases are dropped.
    keyword_filters = [
        BodyKeywordFilter(phrase, case_sensitive=args.casesensitive)
        for phrase in (raw.strip() for raw in args.keywords)
        if phrase
    ]
    if keyword_filters:
        # --any switches the keyword group from and_op to or_op.
        keyword_op = BooleanOperator.or_op if args.any else BooleanOperator.and_op
        criteria.append(BooleanFilter(keyword_op, keyword_filters))

    # 'from' is a reserved word, so these options are read with getattr.
    sender = getattr(args, 'from')
    if sender is not None:
        criteria.append(HeaderFilter('from', sender))
    recipient = getattr(args, 'to')
    if recipient is not None:
        criteria.append(HeaderFilter(['to', 'cc', 'bcc'], recipient))
    if args.subject is not None:
        criteria.append(HeaderFilter('subject', args.subject))

    # Date bounds; the lambdas read args.before / args.after when called.
    if args.before is not None:
        criteria.append(DateFilter(lambda d: compare_dates(d, args.before) <= 0))
    if args.after is not None:
        criteria.append(DateFilter(lambda d: compare_dates(d, args.after) >= 0))

    if not criteria:
        parser.error('No filters specified')
    message_filter = BooleanFilter(BooleanOperator.and_op, criteria)
|
|
300
|
394
|
|
|
301
|
395
|
def handle_results():
|
|
302
|
396
|
"""Final logic after all searching is completed."""
|
|
303
|
397
|
if result_count > 0:
|
|
304
|
398
|
if platform.system() == 'Darwin':
|
|
305
|
|
- subprocess.call(['open', output_path])
|
|
|
399
|
+ subprocess.call(['open', args.output])
|
|
306
|
400
|
elif platform.system() == 'Windows':
|
|
307
|
|
- subprocess.call(['explorer.exe', output_path])
|
|
|
401
|
+ subprocess.call(['explorer.exe', args.output])
|
|
308
|
402
|
print(f'Found {result_count} result(s) total')
|
|
309
|
403
|
elif zip_count == 0:
|
|
310
|
404
|
print('No zip files found')
|
|
|
@@ -315,6 +409,8 @@ def handle_results():
|
|
315
|
409
|
|
|
316
|
410
|
# Main logic
|
|
317
|
411
|
parse_arguments()
|
|
|
412
|
+validate_arguments()
|
|
318
|
413
|
construct_filter()
|
|
319
|
|
-walk_directory(start_path)
|
|
|
414
|
+for path in args.dir:
|
|
|
415
|
+ walk_directory(path)
|
|
320
|
416
|
handle_results()
|