|
|
@@ -1,16 +1,17 @@
|
|
|
1
|
+"""Script for searching email messages stored as raw message files inside a collection of zip archives."""
|
|
1
|
2
|
import argparse
|
|
2
|
|
-import re
|
|
|
3
|
+import os
|
|
3
|
4
|
import platform
|
|
|
5
|
+import re
|
|
4
|
6
|
import subprocess
|
|
|
7
|
+import sys
|
|
|
8
|
+from email.message import EmailMessage
|
|
|
9
|
+from email.parser import BytesParser
|
|
|
10
|
+from email.utils import parsedate
|
|
|
11
|
+from enum import Enum
|
|
5
|
12
|
from tempfile import TemporaryDirectory
|
|
6
|
13
|
from typing import List, Optional, Union
|
|
7
|
|
-from enum import Enum
|
|
8
|
|
-from email.utils import parsedate
|
|
9
|
|
-from email.parser import BytesParser
|
|
10
|
|
-from email.message import EmailMessage
|
|
11
|
14
|
from zipfile import ZipFile
|
|
12
|
|
-import sys
|
|
13
|
|
-import os
|
|
14
|
15
|
|
|
15
|
16
|
class BooleanOperator(Enum):
|
|
16
|
17
|
"""Boolean combinatory operator enum."""
|
|
|
@@ -138,7 +139,6 @@ class Options:
|
|
138
|
139
|
self.subject: Optional[str] = None
|
|
139
|
140
|
self.before: Optional[List[int]] = None
|
|
140
|
141
|
self.after: Optional[List[int]] = None
|
|
141
|
|
- self.raw: bool = False
|
|
142
|
142
|
|
|
143
|
143
|
args: Options = Options()
|
|
144
|
144
|
message_filter: Filter = None
|
|
|
@@ -186,11 +186,11 @@ def filename_from_email(email: EmailMessage) -> str:
|
|
186
|
186
|
filename += '.eml'
|
|
187
|
187
|
return filename
|
|
188
|
188
|
|
|
189
|
|
-def walk_directory(path: str) -> None:
|
|
|
189
|
+def walk_directory(directory: str) -> None:
|
|
190
|
190
|
"""Spiders a directory looking for subdirectories and email zip archives."""
|
|
191
|
191
|
global zip_count
|
|
192
|
|
- for f in os.listdir(path):
|
|
193
|
|
- full_path = path + os.sep + f
|
|
|
192
|
+ for f in os.listdir(directory):
|
|
|
193
|
+ full_path = directory + os.sep + f
|
|
194
|
194
|
if f.lower().endswith('.zip'):
|
|
195
|
195
|
zip_count += 1
|
|
196
|
196
|
process_zip_file(full_path)
|
|
|
@@ -199,14 +199,14 @@ def walk_directory(path: str) -> None:
|
|
199
|
199
|
|
|
200
|
200
|
def process_zip_file(zip_path: str) -> None:
|
|
201
|
201
|
"""Processes a zip file of email messages."""
|
|
202
|
|
- global zip_result_count
|
|
|
202
|
+ global parser, zip_result_count
|
|
203
|
203
|
print('Searching ' + zip_path + '...')
|
|
204
|
204
|
zip_result_count = 0
|
|
205
|
|
- with ZipFile(zip_path, mode='r') as zip:
|
|
206
|
|
- for entry in zip.filelist:
|
|
|
205
|
+ with ZipFile(zip_path, mode='r') as z:
|
|
|
206
|
+ for entry in z.filelist:
|
|
207
|
207
|
if entry.is_dir():
|
|
208
|
208
|
continue
|
|
209
|
|
- data = zip.read(entry)
|
|
|
209
|
+ data = z.read(entry)
|
|
210
|
210
|
parser = BytesParser()
|
|
211
|
211
|
try:
|
|
212
|
212
|
email = parser.parsebytes(data)
|
|
|
@@ -221,12 +221,11 @@ def process_zip_file(zip_path: str) -> None:
|
|
221
|
221
|
|
|
222
|
222
|
def search_content(email: EmailMessage) -> None:
|
|
223
|
223
|
"""Processes an email message in a zip file."""
|
|
224
|
|
- global result_count, zip_result_count
|
|
225
|
224
|
if message_filter.matches(email):
|
|
226
|
225
|
save_message(email)
|
|
227
|
226
|
|
|
228
|
227
|
def search_raw_content(raw_bytes: bytes) -> None:
|
|
229
|
|
- global result_count, zip_result_count
|
|
|
228
|
+ """Searches an unparsed email message."""
|
|
230
|
229
|
encodings = [ 'ascii', 'iso-8859-1', 'utf-8' ]
|
|
231
|
230
|
content = None
|
|
232
|
231
|
for encoding in encodings:
|
|
|
@@ -266,14 +265,11 @@ def save_raw_message(content: bytes) -> None:
|
|
266
|
265
|
def parse_arguments():
|
|
267
|
266
|
"""Parses command-line arguments to `args`."""
|
|
268
|
267
|
global args, parser
|
|
269
|
|
- # TODO: Revisit raw mode and how unparseable emails should be handled
|
|
270
|
268
|
parser = argparse.ArgumentParser(
|
|
271
|
269
|
prog='search.py',
|
|
272
|
270
|
description='Searches a directory of zipped email messages. ' + \
|
|
273
|
271
|
'Messages are assumed to be stored one per file within the zip files (Maildir format). ' + \
|
|
274
|
|
- 'Input directories are searched recursively for any zip files contained within.',
|
|
275
|
|
- epilog='If raw mode is enabled, any messages that cannot be decoded ' + \
|
|
276
|
|
- 'will be searched as raw text.'
|
|
|
272
|
+ 'Input directories are searched recursively for any zip files contained within.'
|
|
277
|
273
|
)
|
|
278
|
274
|
parser.add_argument(
|
|
279
|
275
|
'keywords',
|
|
|
@@ -327,17 +323,10 @@ def parse_arguments():
|
|
327
|
323
|
metavar='YYYY-MM-DD',
|
|
328
|
324
|
help='date to search on or before'
|
|
329
|
325
|
)
|
|
330
|
|
- parser.add_argument(
|
|
331
|
|
- '-r', '--raw',
|
|
332
|
|
- default=False,
|
|
333
|
|
- action='store_true',
|
|
334
|
|
- help='allows searching unparseable messages as raw text'
|
|
335
|
|
- )
|
|
336
|
326
|
args = parser.parse_args()
|
|
337
|
327
|
|
|
338
|
328
|
def validate_arguments():
|
|
339
|
329
|
"""Validate and parse special field types"""
|
|
340
|
|
- global args
|
|
341
|
330
|
args.keywords = args.keywords[0] # no idea why it nests it 2D
|
|
342
|
331
|
if args.before is not None:
|
|
343
|
332
|
m = re.match('^([0-9]{4})-([0-9]{2})-([0-9]{2})$', args.before)
|
|
|
@@ -349,13 +338,6 @@ def validate_arguments():
|
|
349
|
338
|
if m is None:
|
|
350
|
339
|
parser.error('after date must be in YYYY-MM-DD format (e.g. 2015-03-28)')
|
|
351
|
340
|
args.after = [ int(m.group(1)), int(m.group(2)), int(m.group(3)) ]
|
|
352
|
|
- if args.raw:
|
|
353
|
|
- if getattr(args, 'from') is not None or \
|
|
354
|
|
- getattr(args, 'to') is not None or \
|
|
355
|
|
- args.subject is not None or \
|
|
356
|
|
- args.before is not None or \
|
|
357
|
|
- args.after is not None:
|
|
358
|
|
- print('Warning: Cannot search header fields in raw mode. Ignoring.', file=sys.stderr)
|
|
359
|
341
|
if args.dir is None:
|
|
360
|
342
|
args.dir = [ '.' ]
|
|
361
|
343
|
else:
|
|
|
@@ -369,6 +351,7 @@ def validate_arguments():
|
|
369
|
351
|
parser.error(f'output path \'{args.output}\' does not exist or is not a directory')
|
|
370
|
352
|
|
|
371
|
353
|
def construct_filter():
|
|
|
354
|
+ """Sets `message_filter` from parsed command line arguments."""
|
|
372
|
355
|
global message_filter
|
|
373
|
356
|
criteria: List[Filter] = []
|
|
374
|
357
|
keyword_filters = []
|
|
|
@@ -377,7 +360,8 @@ def construct_filter():
|
|
377
|
360
|
if len(k) > 0:
|
|
378
|
361
|
keyword_filters.append(BodyKeywordFilter(k, case_sensitive=args.casesensitive))
|
|
379
|
362
|
if len(keyword_filters) > 0:
|
|
380
|
|
- criteria.append(BooleanFilter(BooleanOperator.or_op if args.any else BooleanOperator.and_op, keyword_filters))
|
|
|
363
|
+ op = BooleanOperator.or_op if args.any else BooleanOperator.and_op
|
|
|
364
|
+ criteria.append(BooleanFilter(op, keyword_filters))
|
|
381
|
365
|
if getattr(args, 'from') is not None:
|
|
382
|
366
|
criteria.append(HeaderFilter('from', getattr(args, 'from')))
|
|
383
|
367
|
if getattr(args, 'to') is not None:
|