Преглед изворни кода

Can now search various fields. Better command line parsing.

master
Rocketsoup пре 2 година
родитељ
комит
7e5419b280
2 измењених фајлова са 156 додато и 54 уклоњено
  1. 6
    0
      .pylintrc
  2. 150
    54
      search.py

+ 6
- 0
.pylintrc Прегледај датотеку

1
+[MESSAGES CONTROL]
2
+disable=bad-indentation, invalid-name
3
+
4
+[FORMAT]
5
+
6
+indent-string=' '

+ 150
- 54
search.py Прегледај датотеку

3
 import platform
3
 import platform
4
 import subprocess
4
 import subprocess
5
 from tempfile import TemporaryDirectory
5
 from tempfile import TemporaryDirectory
6
-from typing import List, Union
6
+from typing import List, Optional, Union
7
 from enum import Enum
7
 from enum import Enum
8
 from email.utils import parsedate
8
 from email.utils import parsedate
9
 from email.parser import BytesParser
9
 from email.parser import BytesParser
10
 from email.message import EmailMessage
10
 from email.message import EmailMessage
11
-from zipfile import ZipFile, ZipInfo
11
+from zipfile import ZipFile
12
 import sys
12
 import sys
13
 import os
13
 import os
14
 
14
 
15
 class BooleanOperator(Enum):
15
 class BooleanOperator(Enum):
16
+	"""Boolean combinatory operator enum."""
16
 	and_op = 1
17
 	and_op = 1
17
 	or_op = 2
18
 	or_op = 2
18
 
19
 
20
 	"""Base class for message filters."""
21
 	"""Base class for message filters."""
21
 	def matches(self, message: EmailMessage) -> bool:
22
 	def matches(self, message: EmailMessage) -> bool:
22
 		"""Returns true if the given email message matches this filter's criteria."""
23
 		"""Returns true if the given email message matches this filter's criteria."""
23
-		raise "Not implemented"
24
+		raise AssertionError("Not implemented")
24
 	def matches_raw(self, raw_content: str) -> bool:
25
 	def matches_raw(self, raw_content: str) -> bool:
25
 		"""Returns true if the given raw, unparsed email content matches this filter's criteria."""
26
 		"""Returns true if the given raw, unparsed email content matches this filter's criteria."""
26
-		raise "Not implemented"
27
+		raise AssertionError("Not implemented")
28
+	def supports_raw(self) -> bool:
29
+		"""Whether this filter supports raw messages at least partially."""
30
+		return False
27
 
31
 
28
 class BodyKeywordFilter(Filter):
32
 class BodyKeywordFilter(Filter):
29
 	"""Simple substring search filter."""
33
 	"""Simple substring search filter."""
46
 		if self.case_sensitive:
50
 		if self.case_sensitive:
47
 			if self.keyword in raw_content:
51
 			if self.keyword in raw_content:
48
 				return True
52
 				return True
49
-			else:
50
-				if self.keyword.lower() in raw_content.lower():
51
-					return True
53
+		else:
54
+			if self.keyword.lower() in raw_content.lower():
55
+				return True
52
 		return False
56
 		return False
53
 
57
 
58
+	def supports_raw(self) -> bool:
59
+		return True
60
+
54
 class HeaderFilter(Filter):
61
 class HeaderFilter(Filter):
55
 	"""Matches a value in an email header. Can search one filter or multiple.
62
 	"""Matches a value in an email header. Can search one filter or multiple.
56
 	Header names case-insensitive; value is case-insensitive."""
63
 	Header names case-insensitive; value is case-insensitive."""
95
 			return True
102
 			return True
96
 		return False
103
 		return False
97
 
104
 
98
-args: argparse.Namespace = None
99
-filter = None
105
+	def supports_raw(self) -> bool:
106
+		for subfilter in self.subfilters:
107
+			if subfilter.supports_raw():
108
+				return True
109
+		return False
110
+
111
+class DateFilter(Filter):
112
+	"""Filters messages based on the date field. For each message with a parseable
113
+	date field, the given comparator is called with a `maketime` list representation
114
+	of the date and time. The comparator must return a bool of whether to match
115
+	the given date or not."""
116
+	def __init__(self, comparator):
117
+		self.comparator = comparator
118
+
119
+	def matches(self, message: EmailMessage) -> bool:
120
+		date_str = message.get('date', None)
121
+		if date_str is None:
122
+			return False
123
+		date_elems = parsedate(date_str)
124
+		if date_elems is None:
125
+			return False
126
+		return self.comparator(date_elems)
127
+
128
+class Options:
129
+	"""Parsed command-line options."""
130
+	def __init__(self):
131
+		self.keywords: List[str] = []
132
+		self.any: bool = False
133
+		self.dir: List[str] = []
134
+		self.output: Optional[str] = None
135
+		self.casesensitive: bool = False
136
+		setattr(self, 'from', None)
137
+		setattr(self, 'to', None)
138
+		self.subject: Optional[str] = None
139
+		self.before: Optional[List[int]] = None
140
+		self.after: Optional[List[int]] = None
141
+		self.raw: bool = False
142
+
143
+args: Options = Options()
144
+message_filter: Filter = None
100
 result_count = 0
145
 result_count = 0
101
 zip_result_count = 0
146
 zip_result_count = 0
102
 zip_count = 0
147
 zip_count = 0
148
+parser: argparse.ArgumentParser = None
149
+
150
+def compare_dates(a: List[int], b: List[int]) -> int:
151
+	"""Compares two list representations of `maketime` date-times. Returns -1 if a < b,
152
+	1 if a > b, and 0 if they are equal."""
153
+	for i in range(6):
154
+		a_elem = a[i] if i < len(a) else -1
155
+		b_elem = b[i] if i < len(b) else -1
156
+		if a_elem < b_elem:
157
+			return -1
158
+		if a_elem > b_elem:
159
+			return 1
160
+	return 0
103
 
161
 
104
 def clean_filename(original: str) -> str:
162
 def clean_filename(original: str) -> str:
105
 	"""Returns a scrubbed string with safe filename characters."""
163
 	"""Returns a scrubbed string with safe filename characters."""
108
 def filename_from_email(email: EmailMessage) -> str:
166
 def filename_from_email(email: EmailMessage) -> str:
109
 	"""Creates a safe filename to save the given email to."""
167
 	"""Creates a safe filename to save the given email to."""
110
 	filename = ''
168
 	filename = ''
111
-	date_str = email.get('Date', None)
169
+	date_str = email.get('date', None)
112
 	if date_str is not None:
170
 	if date_str is not None:
113
 		parsed_date = parsedate(date_str)
171
 		parsed_date = parsedate(date_str)
114
 		if parsed_date is not None:
172
 		if parsed_date is not None:
119
 			filename += '0000-00-00T00.00.00 - '
177
 			filename += '0000-00-00T00.00.00 - '
120
 	else:
178
 	else:
121
 		filename += '0000-00-00T00.00.00 - '
179
 		filename += '0000-00-00T00.00.00 - '
122
-	subject = email.get('Subject')
180
+	filename += f'{result_count:04} - '
181
+	subject = email.get('subject')
123
 	if subject is not None:
182
 	if subject is not None:
124
 		filename += clean_filename(subject)[0:50].strip()
183
 		filename += clean_filename(subject)[0:50].strip()
125
 	else:
184
 	else:
127
 	filename += '.eml'
186
 	filename += '.eml'
128
 	return filename
187
 	return filename
129
 
188
 
130
-def walk_directory(dir: str) -> None:
189
+def walk_directory(path: str) -> None:
131
 	"""Spiders a directory looking for subdirectories and email zip archives."""
190
 	"""Spiders a directory looking for subdirectories and email zip archives."""
132
 	global zip_count
191
 	global zip_count
133
-	for f in os.listdir(dir):
134
-		full_path = dir + os.sep + f
192
+	for f in os.listdir(path):
193
+		full_path = path + os.sep + f
135
 		if f.lower().endswith('.zip'):
194
 		if f.lower().endswith('.zip'):
136
 			zip_count += 1
195
 			zip_count += 1
137
 			process_zip_file(full_path)
196
 			process_zip_file(full_path)
138
-		if os.path.isdir(f):
197
+		if os.path.isdir(full_path):
139
 			walk_directory(full_path)
198
 			walk_directory(full_path)
140
 
199
 
141
 def process_zip_file(zip_path: str) -> None:
200
 def process_zip_file(zip_path: str) -> None:
151
 			parser = BytesParser()
210
 			parser = BytesParser()
152
 			try:
211
 			try:
153
 				email = parser.parsebytes(data)
212
 				email = parser.parsebytes(data)
154
-				search_content(email, zip_path, entry)
155
-			except UnicodeError:
156
-				print('Unicode error in message. Searching raw content.', file=sys.stderr)
213
+				search_content(email)
157
 			except:
214
 			except:
158
-				print('Error parsing message. Searching raw content.', file=sys.stderr)
215
+				if message_filter.supports_raw():
216
+					search_raw_content(data)
217
+				else:
218
+					print('Message cannot be parsed. Skipping.')
159
 	if zip_result_count > 0:
219
 	if zip_result_count > 0:
160
 		print(f"\t{zip_result_count} results in zip")
220
 		print(f"\t{zip_result_count} results in zip")
161
 
221
 
162
-def search_content(email: EmailMessage, zip_path: str, entry: ZipInfo) -> None:
222
+def search_content(email: EmailMessage) -> None:
163
 	"""Processes an email message in a zip file."""
223
 	"""Processes an email message in a zip file."""
164
 	global result_count, zip_result_count
224
 	global result_count, zip_result_count
165
-	if filter.matches(email):
166
-		if not os.path.exists(output_path):
167
-			os.makedirs(output_path)
168
-		with open(output_path + os.sep + filename_from_email(email), 'wb') as f:
169
-			result_count += 1
170
-			zip_result_count += 1
171
-			f.write(email.as_bytes())
225
+	if message_filter.matches(email):
226
+		save_message(email)
172
 
227
 
173
-def search_raw_content(raw_bytes: bytes, zip_path: str, entry: ZipInfo) -> None:
228
+def search_raw_content(raw_bytes: bytes) -> None:
174
 	global result_count, zip_result_count
229
 	global result_count, zip_result_count
175
-	try:
176
-		content = raw_bytes.decode('iso-8859-1', errors='ignore')
177
-	except:
230
+	encodings = [ 'ascii', 'iso-8859-1', 'utf-8' ]
231
+	content = None
232
+	for encoding in encodings:
178
 		try:
233
 		try:
179
-			content = raw_bytes.decode('utf-8', errors='ignore')
234
+			content = raw_bytes.decode(encoding)
235
+			break
180
 		except:
236
 		except:
181
-			print('Cannot decode email bytes. Skipping.', file=sys.stderr)
237
+			pass
238
+	if content is None:
239
+		print('Cannot decode email bytes. Skipping message.', file=sys.stderr)
240
+		return
241
+	print('Could not parse message. Searching raw content.', file=sys.stderr)
242
+	if message_filter.matches_raw(content):
243
+		save_raw_message(raw_bytes)
244
+
245
+def save_message(email: EmailMessage) -> None:
246
+	"""Saves a matching message to the results directory."""
247
+	global result_count, zip_result_count
248
+	if not os.path.exists(args.output):
249
+		os.makedirs(args.output)
250
+	with open(args.output + os.sep + filename_from_email(email), 'wb') as f:
251
+		result_count += 1
252
+		zip_result_count += 1
253
+		f.write(email.as_bytes())
254
+
255
+def save_raw_message(content: bytes) -> None:
256
+	"""Saves an unparseable matching message to the results directory."""
257
+	global result_count, zip_result_count
258
+	if not os.path.exists(args.output):
259
+		os.makedirs(args.output)
260
+	filename = f'unparseable-match-{result_count:04}.eml'
261
+	with open(args.output + os.sep + filename, 'wb') as f:
262
+		result_count += 1
263
+		zip_result_count += 1
264
+		f.write(content)
182
 
265
 
183
 def parse_arguments():
266
 def parse_arguments():
184
 	"""Parses command-line arguments to `args`."""
267
 	"""Parses command-line arguments to `args`."""
185
-	global args
268
+	global args, parser
269
+	# TODO: Revisit raw mode and how unparseable emails should be handled
186
 	parser = argparse.ArgumentParser(
270
 	parser = argparse.ArgumentParser(
187
 		prog='search.py',
271
 		prog='search.py',
188
 		description='Searches a directory of zipped email messages. ' + \
272
 		description='Searches a directory of zipped email messages. ' + \
189
 			'Messages are assumed to be stored one per file within the zip files (Maildir format). ' + \
273
 			'Messages are assumed to be stored one per file within the zip files (Maildir format). ' + \
190
 			'Input directories are searched recursively for any zip files contained within.',
274
 			'Input directories are searched recursively for any zip files contained within.',
191
-		epilog='Raw mode will skip parsing each email message and treat them like simple text files. ' + \
192
-			'The headers and body are all searched together without decoding. ' + \
193
-			'Arguments for searching individual fields will be ignored. ' + \
194
-			'This option exists for messages with encoding errors that prevent them from being ' + \
195
-			'parsed correctly. ' + \
196
-			'Note that various escaping/encoding schemes commonly used in email messages, such ' + \
197
-			'as base64, may cause keywords to not be found despite being in the decoded message ' + \
198
-			'because only the raw encoded content is searched. ' + \
199
-			'Use this option as a last resort.'
275
+		epilog='If raw mode is enabled, any messages that cannot be decoded ' + \
276
+			'will be searched as raw text.'
200
 	)
277
 	)
201
 	parser.add_argument(
278
 	parser.add_argument(
202
 		'keywords',
279
 		'keywords',
203
 		action='append',
280
 		action='append',
204
-		nargs='+',
281
+		nargs='*',
205
 		help='one or more phrases to search for in the message body'
282
 		help='one or more phrases to search for in the message body'
206
 	)
283
 	)
207
 	parser.add_argument(
284
 	parser.add_argument(
208
 		'--any',
285
 		'--any',
209
 		default=False,
286
 		default=False,
210
 		action='store_true',
287
 		action='store_true',
211
-		help='matches messages containing any of the given search phrases (default requires all phrases appear in a message)'
288
+		help='matches messages containing any of the given search phrases (default requires ' + \
289
+			'all phrases appear in a message)'
212
 	)
290
 	)
213
 	parser.add_argument(
291
 	parser.add_argument(
214
 		'-d', '--dir',
292
 		'-d', '--dir',
227
 	)
305
 	)
228
 	parser.add_argument(
306
 	parser.add_argument(
229
 		'-f', '--from',
307
 		'-f', '--from',
308
+		metavar='sender-email',
230
 		help='email address of sender'
309
 		help='email address of sender'
231
 	)
310
 	)
232
 	parser.add_argument(
311
 	parser.add_argument(
233
 		'-t', '--to',
312
 		'-t', '--to',
313
+		metavar='recipient-email',
234
 		help='email address of recipient (searches to:, cc:, bcc: fields)'
314
 		help='email address of recipient (searches to:, cc:, bcc: fields)'
235
 	)
315
 	)
236
 	parser.add_argument(
316
 	parser.add_argument(
251
 		'-r', '--raw',
331
 		'-r', '--raw',
252
 		default=False,
332
 		default=False,
253
 		action='store_true',
333
 		action='store_true',
254
-		help='searches raw email content (see below)'
334
+		help='allows searching unparseable messages as raw text'
255
 	)
335
 	)
256
 	args = parser.parse_args()
336
 	args = parser.parse_args()
257
 
337
 
338
+def validate_arguments():
339
+	"""Validate and parse special field types"""
340
+	global args
341
+	args.keywords = args.keywords[0]  # no idea why it nests it 2D
258
 	if args.before is not None:
342
 	if args.before is not None:
259
 		m = re.match('^([0-9]{4})-([0-9]{2})-([0-9]{2})$', args.before)
343
 		m = re.match('^([0-9]{4})-([0-9]{2})-([0-9]{2})$', args.before)
260
 		if m is None:
344
 		if m is None:
285
 			parser.error(f'output path \'{args.output}\' does not exist or is not a directory')
369
 			parser.error(f'output path \'{args.output}\' does not exist or is not a directory')
286
 
370
 
287
 def construct_filter():
371
 def construct_filter():
288
-	global filter
372
+	global message_filter
289
 	criteria: List[Filter] = []
373
 	criteria: List[Filter] = []
290
-	keyword_filters = map(lambda k : BodyKeywordFilter(k, case_sensitive=args.casesensitive), args.keywords)
291
-	criteria.append(BooleanFilter(BooleanOperator.or_op if args.any else BooleanOperator.and_op, keyword_filters))
374
+	keyword_filters = []
375
+	for k in args.keywords:
376
+		k = k.strip()
377
+		if len(k) > 0:
378
+			keyword_filters.append(BodyKeywordFilter(k, case_sensitive=args.casesensitive))
379
+	if len(keyword_filters) > 0:
380
+		criteria.append(BooleanFilter(BooleanOperator.or_op if args.any else BooleanOperator.and_op, keyword_filters))
292
 	if getattr(args, 'from') is not None:
381
 	if getattr(args, 'from') is not None:
293
 		criteria.append(HeaderFilter('from', getattr(args, 'from')))
382
 		criteria.append(HeaderFilter('from', getattr(args, 'from')))
294
 	if getattr(args, 'to') is not None:
383
 	if getattr(args, 'to') is not None:
295
 		criteria.append(HeaderFilter(['to', 'cc', 'bcc'], getattr(args, 'to')))
384
 		criteria.append(HeaderFilter(['to', 'cc', 'bcc'], getattr(args, 'to')))
296
 	if args.subject is not None:
385
 	if args.subject is not None:
297
 		criteria.append(HeaderFilter('subject', args.subject))
386
 		criteria.append(HeaderFilter('subject', args.subject))
298
-	# TODO: Dates
299
-	filter = BooleanFilter(BooleanOperator.and_op, criteria)
387
+	if args.before is not None:
388
+		criteria.append(DateFilter(lambda d: compare_dates(d, args.before) <= 0))
389
+	if args.after is not None:
390
+		criteria.append(DateFilter(lambda d: compare_dates(d, args.after) >= 0))
391
+	if len(criteria) == 0:
392
+		parser.error('No filters specified')
393
+	message_filter = BooleanFilter(BooleanOperator.and_op, criteria)
300
 
394
 
301
 def handle_results():
395
 def handle_results():
302
 	"""Final logic after all searching is completed."""
396
 	"""Final logic after all searching is completed."""
303
 	if result_count > 0:
397
 	if result_count > 0:
304
 		if platform.system() == 'Darwin':
398
 		if platform.system() == 'Darwin':
305
-			subprocess.call(['open', output_path])
399
+			subprocess.call(['open', args.output])
306
 		elif platform.system() == 'Windows':
400
 		elif platform.system() == 'Windows':
307
-			subprocess.call(['explorer.exe', output_path])
401
+			subprocess.call(['explorer.exe', args.output])
308
 		print(f'Found {result_count} result(s) total')
402
 		print(f'Found {result_count} result(s) total')
309
 	elif zip_count == 0:
403
 	elif zip_count == 0:
310
 		print('No zip files found')
404
 		print('No zip files found')
315
 
409
 
316
 # Main logic
410
 # Main logic
317
 parse_arguments()
411
 parse_arguments()
412
+validate_arguments()
318
 construct_filter()
413
 construct_filter()
319
-walk_directory(start_path)
414
+for path in args.dir:
415
+	walk_directory(path)
320
 handle_results()
416
 handle_results()

Loading…
Откажи
Сачувај