Quellcode durchsuchen

Can now search various fields. Better command line parsing.

master
Rocketsoup vor 2 Jahren
Ursprung
Commit
7e5419b280
2 geänderte Dateien mit 156 neuen und 54 gelöschten Zeilen
  1. 6
    0
      .pylintrc
  2. 150
    54
      search.py

+ 6
- 0
.pylintrc Datei anzeigen

@@ -0,0 +1,6 @@
1
+[MESSAGES CONTROL]
2
+disable=bad-indentation, invalid-name
3
+
4
+[FORMAT]
5
+
6
+indent-string=' '

+ 150
- 54
search.py Datei anzeigen

@@ -3,16 +3,17 @@ import re
3 3
 import platform
4 4
 import subprocess
5 5
 from tempfile import TemporaryDirectory
6
-from typing import List, Union
6
+from typing import List, Optional, Union
7 7
 from enum import Enum
8 8
 from email.utils import parsedate
9 9
 from email.parser import BytesParser
10 10
 from email.message import EmailMessage
11
-from zipfile import ZipFile, ZipInfo
11
+from zipfile import ZipFile
12 12
 import sys
13 13
 import os
14 14
 
15 15
 class BooleanOperator(Enum):
16
+	"""Boolean combinatory operator enum."""
16 17
 	and_op = 1
17 18
 	or_op = 2
18 19
 
@@ -20,10 +21,13 @@ class Filter:
20 21
 	"""Base class for message filters."""
21 22
 	def matches(self, message: EmailMessage) -> bool:
22 23
 		"""Returns true if the given email message matches this filter's criteria."""
23
-		raise "Not implemented"
24
+		raise AssertionError("Not implemented")
24 25
 	def matches_raw(self, raw_content: str) -> bool:
25 26
 		"""Returns true if the given raw, unparsed email content matches this filter's criteria."""
26
-		raise "Not implemented"
27
+		raise AssertionError("Not implemented")
28
+	def supports_raw(self) -> bool:
29
+		"""Whether this filter supports raw messages at least partially."""
30
+		return False
27 31
 
28 32
 class BodyKeywordFilter(Filter):
29 33
 	"""Simple substring search filter."""
@@ -46,11 +50,14 @@ class BodyKeywordFilter(Filter):
46 50
 		if self.case_sensitive:
47 51
 			if self.keyword in raw_content:
48 52
 				return True
49
-			else:
50
-				if self.keyword.lower() in raw_content.lower():
51
-					return True
53
+		else:
54
+			if self.keyword.lower() in raw_content.lower():
55
+				return True
52 56
 		return False
53 57
 
58
+	def supports_raw(self) -> bool:
59
+		return True
60
+
54 61
 class HeaderFilter(Filter):
55 62
 	"""Matches a value in an email header. Can search one header or multiple.
56 63
 	Header names and values are both matched case-insensitively."""
@@ -95,11 +102,62 @@ class BooleanFilter(Filter):
95 102
 			return True
96 103
 		return False
97 104
 
98
-args: argparse.Namespace = None
99
-filter = None
105
+	def supports_raw(self) -> bool:
106
+		for subfilter in self.subfilters:
107
+			if subfilter.supports_raw():
108
+				return True
109
+		return False
110
+
111
+class DateFilter(Filter):
112
+	"""Filters messages based on the date field. For each message with a parseable
113
+	date field, the given comparator is called with a `mktime`-style list representation
114
+	of the date and time. The comparator must return a bool of whether to match
115
+	the given date or not."""
116
+	def __init__(self, comparator):
117
+		self.comparator = comparator
118
+
119
+	def matches(self, message: EmailMessage) -> bool:
120
+		date_str = message.get('date', None)
121
+		if date_str is None:
122
+			return False
123
+		date_elems = parsedate(date_str)
124
+		if date_elems is None:
125
+			return False
126
+		return self.comparator(date_elems)
127
+
128
+class Options:
129
+	"""Parsed command-line options."""
130
+	def __init__(self):
131
+		self.keywords: List[str] = []
132
+		self.any: bool = False
133
+		self.dir: List[str] = []
134
+		self.output: Optional[str] = None
135
+		self.casesensitive: bool = False
136
+		setattr(self, 'from', None)
137
+		setattr(self, 'to', None)
138
+		self.subject: Optional[str] = None
139
+		self.before: Optional[List[int]] = None
140
+		self.after: Optional[List[int]] = None
141
+		self.raw: bool = False
142
+
143
+args: Options = Options()
144
+message_filter: Filter = None
100 145
 result_count = 0
101 146
 zip_result_count = 0
102 147
 zip_count = 0
148
+parser: argparse.ArgumentParser = None
149
+
150
+def compare_dates(a: List[int], b: List[int]) -> int:
151
+	"""Compares two list representations of `mktime`-style date-times. Returns -1 if a < b,
152
+	1 if a > b, and 0 if they are equal."""
153
+	for i in range(6):
154
+		a_elem = a[i] if i < len(a) else -1
155
+		b_elem = b[i] if i < len(b) else -1
156
+		if a_elem < b_elem:
157
+			return -1
158
+		if a_elem > b_elem:
159
+			return 1
160
+	return 0
103 161
 
104 162
 def clean_filename(original: str) -> str:
105 163
 	"""Returns a scrubbed string with safe filename characters."""
@@ -108,7 +166,7 @@ def clean_filename(original: str) -> str:
108 166
 def filename_from_email(email: EmailMessage) -> str:
109 167
 	"""Creates a safe filename to save the given email to."""
110 168
 	filename = ''
111
-	date_str = email.get('Date', None)
169
+	date_str = email.get('date', None)
112 170
 	if date_str is not None:
113 171
 		parsed_date = parsedate(date_str)
114 172
 		if parsed_date is not None:
@@ -119,7 +177,8 @@ def filename_from_email(email: EmailMessage) -> str:
119 177
 			filename += '0000-00-00T00.00.00 - '
120 178
 	else:
121 179
 		filename += '0000-00-00T00.00.00 - '
122
-	subject = email.get('Subject')
180
+	filename += f'{result_count:04} - '
181
+	subject = email.get('subject')
123 182
 	if subject is not None:
124 183
 		filename += clean_filename(subject)[0:50].strip()
125 184
 	else:
@@ -127,15 +186,15 @@ def filename_from_email(email: EmailMessage) -> str:
127 186
 	filename += '.eml'
128 187
 	return filename
129 188
 
130
-def walk_directory(dir: str) -> None:
189
+def walk_directory(path: str) -> None:
131 190
 	"""Spiders a directory looking for subdirectories and email zip archives."""
132 191
 	global zip_count
133
-	for f in os.listdir(dir):
134
-		full_path = dir + os.sep + f
192
+	for f in os.listdir(path):
193
+		full_path = path + os.sep + f
135 194
 		if f.lower().endswith('.zip'):
136 195
 			zip_count += 1
137 196
 			process_zip_file(full_path)
138
-		if os.path.isdir(f):
197
+		if os.path.isdir(full_path):
139 198
 			walk_directory(full_path)
140 199
 
141 200
 def process_zip_file(zip_path: str) -> None:
@@ -151,64 +210,83 @@ def process_zip_file(zip_path: str) -> None:
151 210
 			parser = BytesParser()
152 211
 			try:
153 212
 				email = parser.parsebytes(data)
154
-				search_content(email, zip_path, entry)
155
-			except UnicodeError:
156
-				print('Unicode error in message. Searching raw content.', file=sys.stderr)
213
+				search_content(email)
157 214
 			except:
158
-				print('Error parsing message. Searching raw content.', file=sys.stderr)
215
+				if message_filter.supports_raw():
216
+					search_raw_content(data)
217
+				else:
218
+					print('Message cannot be parsed. Skipping.')
159 219
 	if zip_result_count > 0:
160 220
 		print(f"\t{zip_result_count} results in zip")
161 221
 
162
-def search_content(email: EmailMessage, zip_path: str, entry: ZipInfo) -> None:
222
+def search_content(email: EmailMessage) -> None:
163 223
 	"""Processes an email message in a zip file."""
164 224
 	global result_count, zip_result_count
165
-	if filter.matches(email):
166
-		if not os.path.exists(output_path):
167
-			os.makedirs(output_path)
168
-		with open(output_path + os.sep + filename_from_email(email), 'wb') as f:
169
-			result_count += 1
170
-			zip_result_count += 1
171
-			f.write(email.as_bytes())
225
+	if message_filter.matches(email):
226
+		save_message(email)
172 227
 
173
-def search_raw_content(raw_bytes: bytes, zip_path: str, entry: ZipInfo) -> None:
228
+def search_raw_content(raw_bytes: bytes) -> None:
174 229
 	global result_count, zip_result_count
175
-	try:
176
-		content = raw_bytes.decode('iso-8859-1', errors='ignore')
177
-	except:
230
+	encodings = [ 'ascii', 'iso-8859-1', 'utf-8' ]
231
+	content = None
232
+	for encoding in encodings:
178 233
 		try:
179
-			content = raw_bytes.decode('utf-8', errors='ignore')
234
+			content = raw_bytes.decode(encoding)
235
+			break
180 236
 		except:
181
-			print('Cannot decode email bytes. Skipping.', file=sys.stderr)
237
+			pass
238
+	if content is None:
239
+		print('Cannot decode email bytes. Skipping message.', file=sys.stderr)
240
+		return
241
+	print('Could not parse message. Searching raw content.', file=sys.stderr)
242
+	if message_filter.matches_raw(content):
243
+		save_raw_message(raw_bytes)
244
+
245
+def save_message(email: EmailMessage) -> None:
246
+	"""Saves a matching message to the results directory."""
247
+	global result_count, zip_result_count
248
+	if not os.path.exists(args.output):
249
+		os.makedirs(args.output)
250
+	with open(args.output + os.sep + filename_from_email(email), 'wb') as f:
251
+		result_count += 1
252
+		zip_result_count += 1
253
+		f.write(email.as_bytes())
254
+
255
+def save_raw_message(content: bytes) -> None:
256
+	"""Saves an unparseable matching message to the results directory."""
257
+	global result_count, zip_result_count
258
+	if not os.path.exists(args.output):
259
+		os.makedirs(args.output)
260
+	filename = f'unparseable-match-{result_count:04}.eml'
261
+	with open(args.output + os.sep + filename, 'wb') as f:
262
+		result_count += 1
263
+		zip_result_count += 1
264
+		f.write(content)
182 265
 
183 266
 def parse_arguments():
184 267
 	"""Parses command-line arguments to `args`."""
185
-	global args
268
+	global args, parser
269
+	# TODO: Revisit raw mode and how unparseable emails should be handled
186 270
 	parser = argparse.ArgumentParser(
187 271
 		prog='search.py',
188 272
 		description='Searches a directory of zipped email messages. ' + \
189 273
 			'Messages are assumed to be stored one per file within the zip files (Maildir format). ' + \
190 274
 			'Input directories are searched recursively for any zip files contained within.',
191
-		epilog='Raw mode will skip parsing each email message and treat them like simple text files. ' + \
192
-			'The headers and body are all searched together without decoding. ' + \
193
-			'Arguments for searching individual fields will be ignored. ' + \
194
-			'This option exists for messages with encoding errors that prevent them from being ' + \
195
-			'parsed correctly. ' + \
196
-			'Note that various escaping/encoding schemes commonly used in email messages, such ' + \
197
-			'as base64, may cause keywords to not be found despite being in the decoded message ' + \
198
-			'because only the raw encoded content is searched. ' + \
199
-			'Use this option as a last resort.'
275
+		epilog='If raw mode is enabled, any messages that cannot be decoded ' + \
276
+			'will be searched as raw text.'
200 277
 	)
201 278
 	parser.add_argument(
202 279
 		'keywords',
203 280
 		action='append',
204
-		nargs='+',
281
+		nargs='*',
205 282
 		help='one or more phrases to search for in the message body'
206 283
 	)
207 284
 	parser.add_argument(
208 285
 		'--any',
209 286
 		default=False,
210 287
 		action='store_true',
211
-		help='matches messages containing any of the given search phrases (default requires all phrases appear in a message)'
288
+		help='matches messages containing any of the given search phrases (default requires ' + \
289
+			'all phrases appear in a message)'
212 290
 	)
213 291
 	parser.add_argument(
214 292
 		'-d', '--dir',
@@ -227,10 +305,12 @@ def parse_arguments():
227 305
 	)
228 306
 	parser.add_argument(
229 307
 		'-f', '--from',
308
+		metavar='sender-email',
230 309
 		help='email address of sender'
231 310
 	)
232 311
 	parser.add_argument(
233 312
 		'-t', '--to',
313
+		metavar='recipient-email',
234 314
 		help='email address of recipient (searches to:, cc:, bcc: fields)'
235 315
 	)
236 316
 	parser.add_argument(
@@ -251,10 +331,14 @@ def parse_arguments():
251 331
 		'-r', '--raw',
252 332
 		default=False,
253 333
 		action='store_true',
254
-		help='searches raw email content (see below)'
334
+		help='allows searching unparseable messages as raw text'
255 335
 	)
256 336
 	args = parser.parse_args()
257 337
 
338
+def validate_arguments():
339
+	"""Validate and parse special field types"""
340
+	global args
341
+	args.keywords = args.keywords[0]  # action='append' combined with nargs stores a list of lists; unwrap it
258 342
 	if args.before is not None:
259 343
 		m = re.match('^([0-9]{4})-([0-9]{2})-([0-9]{2})$', args.before)
260 344
 		if m is None:
@@ -285,26 +369,36 @@ def parse_arguments():
285 369
 			parser.error(f'output path \'{args.output}\' does not exist or is not a directory')
286 370
 
287 371
 def construct_filter():
288
-	global filter
372
+	global message_filter
289 373
 	criteria: List[Filter] = []
290
-	keyword_filters = map(lambda k : BodyKeywordFilter(k, case_sensitive=args.casesensitive), args.keywords)
291
-	criteria.append(BooleanFilter(BooleanOperator.or_op if args.any else BooleanOperator.and_op, keyword_filters))
374
+	keyword_filters = []
375
+	for k in args.keywords:
376
+		k = k.strip()
377
+		if len(k) > 0:
378
+			keyword_filters.append(BodyKeywordFilter(k, case_sensitive=args.casesensitive))
379
+	if len(keyword_filters) > 0:
380
+		criteria.append(BooleanFilter(BooleanOperator.or_op if args.any else BooleanOperator.and_op, keyword_filters))
292 381
 	if getattr(args, 'from') is not None:
293 382
 		criteria.append(HeaderFilter('from', getattr(args, 'from')))
294 383
 	if getattr(args, 'to') is not None:
295 384
 		criteria.append(HeaderFilter(['to', 'cc', 'bcc'], getattr(args, 'to')))
296 385
 	if args.subject is not None:
297 386
 		criteria.append(HeaderFilter('subject', args.subject))
298
-	# TODO: Dates
299
-	filter = BooleanFilter(BooleanOperator.and_op, criteria)
387
+	if args.before is not None:
388
+		criteria.append(DateFilter(lambda d: compare_dates(d, args.before) <= 0))
389
+	if args.after is not None:
390
+		criteria.append(DateFilter(lambda d: compare_dates(d, args.after) >= 0))
391
+	if len(criteria) == 0:
392
+		parser.error('No filters specified')
393
+	message_filter = BooleanFilter(BooleanOperator.and_op, criteria)
300 394
 
301 395
 def handle_results():
302 396
 	"""Final logic after all searching is completed."""
303 397
 	if result_count > 0:
304 398
 		if platform.system() == 'Darwin':
305
-			subprocess.call(['open', output_path])
399
+			subprocess.call(['open', args.output])
306 400
 		elif platform.system() == 'Windows':
307
-			subprocess.call(['explorer.exe', output_path])
401
+			subprocess.call(['explorer.exe', args.output])
308 402
 		print(f'Found {result_count} result(s) total')
309 403
 	elif zip_count == 0:
310 404
 		print('No zip files found')
@@ -315,6 +409,8 @@ def handle_results():
315 409
 
316 410
 # Main logic
317 411
 parse_arguments()
412
+validate_arguments()
318 413
 construct_filter()
319
-walk_directory(start_path)
414
+for path in args.dir:
415
+	walk_directory(path)
320 416
 handle_results()

Laden…
Abbrechen
Speichern