|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+import argparse
|
|
1
|
2
|
import re
|
|
2
|
3
|
import platform
|
|
3
|
4
|
import subprocess
|
|
|
@@ -18,6 +19,10 @@ class BooleanOperator(Enum):
|
|
18
|
19
|
class Filter:
    """Base class for message filters.

    Subclasses override `matches` (parsed messages) and/or `matches_raw`
    (unparsed text). The base implementations always raise.
    """

    def matches(self, message: EmailMessage) -> bool:
        """Returns true if the given email message matches this filter's criteria.

        Raises:
            NotImplementedError: always, unless overridden by a subclass.
        """
        # Raising a plain string is itself a TypeError in Python 3, which hid
        # the real problem; use the standard exception for abstract methods.
        raise NotImplementedError(f"{type(self).__name__} does not implement matches()")

    def matches_raw(self, raw_content: str) -> bool:
        """Returns true if the given raw, unparsed email content matches this filter's criteria.

        Raises:
            NotImplementedError: always, unless overridden by a subclass.
        """
        raise NotImplementedError(f"{type(self).__name__} does not implement matches_raw()")
|
|
22
|
27
|
|
|
23
|
28
|
class BodyKeywordFilter(Filter):
|
|
|
@@ -37,6 +42,15 @@ class BodyKeywordFilter(Filter):
|
|
37
|
42
|
return True
|
|
38
|
43
|
return False
|
|
39
|
44
|
|
|
|
45
|
def matches_raw(self, raw_content: str) -> bool:
    """Returns true if this filter's keyword occurs anywhere in the raw, unparsed content.

    When `case_sensitive` is false, both the keyword and the content are
    lowercased before the substring test.
    """
    if self.case_sensitive:
        return self.keyword in raw_content
    return self.keyword.lower() in raw_content.lower()
|
|
|
53
|
+
|
|
40
|
54
|
class HeaderFilter(Filter):
|
|
41
|
55
|
"""Matches a value in an email header. Can search one filter or multiple.
|
|
42
|
56
|
Header names case-insensitive; value is case-insensitive."""
|
|
|
@@ -70,10 +84,19 @@ class BooleanFilter(Filter):
|
|
70
|
84
|
return True
|
|
71
|
85
|
return False
|
|
72
|
86
|
|
|
73
|
|
-start_path = '.'
|
|
74
|
|
-output_path = TemporaryDirectory(prefix='Email search results (id ', suffix=')').name
|
|
|
87
|
def matches_raw(self, raw_content: str) -> bool:
    """Returns true if the raw content satisfies this filter's subfilters under its operator.

    With `and_op`, every subfilter must match (stops at the first miss; an
    empty subfilter list matches). With `or_op`, one match is enough (stops at
    the first hit; an empty subfilter list does not match).
    """
    require_all = self.operator == BooleanOperator.and_op
    require_any = self.operator == BooleanOperator.or_op
    for candidate in self.subfilters:
        matched = candidate.matches_raw(raw_content)
        if require_all and not matched:
            return False
        if require_any and matched:
            return True
    # Loop finished without short-circuiting: AND succeeded, anything else failed.
    return require_all
|
|
|
97
|
+
|
|
|
98
|
+args: argparse.Namespace = None
|
|
75
|
99
|
filter = None
|
|
76
|
|
-case_sensitive = False
|
|
77
|
100
|
result_count = 0
|
|
78
|
101
|
zip_result_count = 0
|
|
79
|
102
|
zip_count = 0
|
|
|
@@ -130,9 +153,9 @@ def process_zip_file(zip_path: str) -> None:
|
|
130
|
153
|
email = parser.parsebytes(data)
|
|
131
|
154
|
search_content(email, zip_path, entry)
|
|
132
|
155
|
except UnicodeError:
|
|
133
|
|
- print('Unicode error in message. Skipping.')
|
|
|
156
|
+ print('Unicode error in message. Searching raw content.', file=sys.stderr)
|
|
134
|
157
|
except:
|
|
135
|
|
- print('Error reading message')
|
|
|
158
|
+ print('Error parsing message. Searching raw content.', file=sys.stderr)
|
|
136
|
159
|
if zip_result_count > 0:
|
|
137
|
160
|
print(f"\t{zip_result_count} results in zip")
|
|
138
|
161
|
|
|
|
@@ -147,56 +170,133 @@ def search_content(email: EmailMessage, zip_path: str, entry: ZipInfo) -> None:
|
|
147
|
170
|
zip_result_count += 1
|
|
148
|
171
|
f.write(email.as_bytes())
|
|
149
|
172
|
|
|
|
173
|
def search_raw_content(raw_bytes: bytes, zip_path: str, entry: ZipInfo) -> None:
    """Decodes an unparsed email message to text for raw searching.

    Args:
        raw_bytes: the message exactly as stored in the zip entry.
        zip_path: path of the zip archive the entry came from.
        entry: the zip entry the bytes were read from.

    NOTE(review): as in the original, this function stops after decoding; it
    does not yet apply the filter, update the result counters, or write the
    message out. That logic still needs to be added.
    """
    global result_count, zip_result_count
    # Try strict UTF-8 first. ISO-8859-1 maps every byte value to a character,
    # so it can never fail and must be the fallback rather than the first
    # attempt; otherwise UTF-8 text is silently mis-decoded and the fallback
    # (and any "cannot decode" branch) is unreachable.
    try:
        content = raw_bytes.decode('utf-8')
    except UnicodeDecodeError:
        content = raw_bytes.decode('iso-8859-1')
|
|
|
182
|
+
|
|
150
|
183
|
def parse_arguments():
    """Parses command-line arguments to `args`.

    After this returns, the global `args` holds:
      * keywords: flat list of search phrases (at least one).
      * any, casesensitive, raw: booleans.
      * dir: list of existing directories to search (defaults to ['.']).
      * output: output directory path.
      * from, to, subject: header search strings or None.
      * before, after: [year, month, day] lists or None.

    Invalid dates or paths exit through `parser.error` (SystemExit, status 2).
    """
    global args
    parser = argparse.ArgumentParser(
        prog='search.py',
        description=(
            'Searches a directory of zipped email messages. '
            'Messages are assumed to be stored one per file within the zip files (Maildir format). '
            'Input directories are searched recursively for any zip files contained within.'
        ),
        epilog=(
            'Raw mode will skip parsing each email message and treat them like simple text files. '
            'The headers and body are all searched together without decoding. '
            'Arguments for searching individual fields will be ignored. '
            'This option exists for messages with encoding errors that prevent them from being '
            'parsed correctly. '
            'Note that various escaping/encoding schemes commonly used in email messages, such '
            'as base64, may cause keywords to not be found despite being in the decoded message '
            'because only the raw encoded content is searched. '
            'Use this option as a last resort.'
        ),
    )
    # nargs='+' already collects the phrases into a list. Combining it with
    # action='append' wrapped that list in another list ([['a', 'b']]), so
    # each "keyword" handed to the filter was a list instead of a string.
    parser.add_argument(
        'keywords',
        nargs='+',
        help='one or more phrases to search for in the message body',
    )
    parser.add_argument(
        '--any',
        default=False,
        action='store_true',
        help='matches messages containing any of the given search phrases (default requires all phrases appear in a message)',
    )
    parser.add_argument(
        '-d', '--dir',
        action='append',
        help='directory(s) to search for email zip archives (default is working directory)',
    )
    parser.add_argument(
        '-o', '--output',
        help='directory to copy matching messages to (default is a temp directory)',
    )
    parser.add_argument(
        '-c', '--casesensitive',
        default=False,
        action='store_true',
        help='search case-sensitively (default is case-insensitive)',
    )
    parser.add_argument(
        '-f', '--from',
        help='email address of sender',
    )
    parser.add_argument(
        '-t', '--to',
        help='email address of recipient (searches to:, cc:, bcc: fields)',
    )
    parser.add_argument(
        '-s', '--subject',
        help='searches subject field',
    )
    parser.add_argument(
        '-a', '--after',
        metavar='YYYY-MM-DD',
        help='date to search on or after',
    )
    parser.add_argument(
        '-b', '--before',
        metavar='YYYY-MM-DD',
        help='date to search on or before',
    )
    parser.add_argument(
        '-r', '--raw',
        default=False,
        action='store_true',
        help='searches raw email content (see below)',
    )
    args = parser.parse_args()

    if args.before is not None:
        args.before = _parse_date_argument(parser, 'before', args.before)
    if args.after is not None:
        args.after = _parse_date_argument(parser, 'after', args.after)

    if args.raw:
        # 'from' is a Python keyword, so that attribute needs getattr().
        header_options = (
            getattr(args, 'from'), args.to, args.subject, args.before, args.after,
        )
        if any(option is not None for option in header_options):
            print('Warning: Cannot search header fields in raw mode. Ignoring.', file=sys.stderr)

    if args.dir is None:
        args.dir = ['.']
    else:
        for d in args.dir:
            # isdir() is already false for paths that do not exist.
            if not os.path.isdir(d):
                parser.error(f"search path '{d}' does not exist or is not a directory")

    if args.output is None:
        # NOTE(review): kept from the original. Only the .name of an
        # unreferenced TemporaryDirectory is stored, and that object removes
        # its directory when it is finalized, so this path may no longer exist
        # when results are written. Confirm the output code creates it.
        args.output = TemporaryDirectory(prefix='Email search results (id ', suffix=')').name
    elif not os.path.isdir(args.output):
        parser.error(f"output path '{args.output}' does not exist or is not a directory")


def _parse_date_argument(parser, option_name, value):
    """Converts a YYYY-MM-DD string to [year, month, day], exiting via `parser.error` if invalid."""
    import datetime

    m = re.match('^([0-9]{4})-([0-9]{2})-([0-9]{2})$', value)
    if m is None:
        parser.error(f'{option_name} date must be in YYYY-MM-DD format (e.g. 2015-03-28)')
    parts = [int(group) for group in m.groups()]
    # The pattern only checks the shape; reject impossible dates such as 2015-02-30.
    try:
        datetime.date(*parts)
    except ValueError:
        parser.error(f'{option_name} date {value} is not a valid calendar date')
    return parts
|
|
194
|
286
|
|
|
195
|
|
-def validate_arguments():
|
|
196
|
|
- if filter is None:
|
|
197
|
|
- print('No filter specified')
|
|
198
|
|
- sys.exit(3)
|
|
199
|
|
- pass
|
|
|
287
|
def construct_filter():
    """Builds the global `filter` from the parsed command-line `args`.

    The result is an AND of:
      * a keyword filter (OR of the phrases with --any, otherwise AND), and
      * outside raw mode, one header filter per --from / --to / --subject given.
    """
    global filter
    # Accept both a flat list of phrases and a list of lists (which argparse
    # produces when 'append' is combined with nargs='+').
    phrases = []
    for item in args.keywords:
        if isinstance(item, str):
            phrases.append(item)
        else:
            phrases.extend(item)
    # This must be a real list: a map() object is exhausted after its first
    # iteration, so a filter that iterates its subfilters once per message
    # would see none from the second message on.
    keyword_filters = [
        BodyKeywordFilter(phrase, case_sensitive=args.casesensitive)
        for phrase in phrases
    ]
    keyword_operator = BooleanOperator.or_op if args.any else BooleanOperator.and_op
    criteria: List[Filter] = [BooleanFilter(keyword_operator, keyword_filters)]
    # Raw mode is documented as ignoring header options, and the base
    # Filter.matches_raw raises for filters that do not override it, so header
    # criteria are only added for parsed searches.
    if not getattr(args, 'raw', False):
        sender = getattr(args, 'from')  # 'from' is a keyword; needs getattr()
        if sender is not None:
            criteria.append(HeaderFilter('from', sender))
        if args.to is not None:
            criteria.append(HeaderFilter(['to', 'cc', 'bcc'], args.to))
        if args.subject is not None:
            criteria.append(HeaderFilter('subject', args.subject))
        # TODO: Dates
    filter = BooleanFilter(BooleanOperator.and_op, criteria)
|
|
200
|
300
|
|
|
201
|
301
|
def handle_results():
|
|
202
|
302
|
"""Final logic after all searching is completed."""
|
|
|
@@ -215,6 +315,6 @@ def handle_results():
|
|
215
|
315
|
|
|
216
|
316
|
# Main logic
parse_arguments()
construct_filter()
# The start_path global no longer exists; parse_arguments() leaves args.dir as
# a non-empty list of directories (['.'] by default), so search each in turn.
for search_dir in args.dir:
    walk_directory(search_dir)
handle_results()
|