瀏覽代碼

argparse refactor WIP

argparse
Rocketsoup 2 年之前
父節點
當前提交
efba628bdb
共有 1 個檔案被更改,包括 154 行新增和 54 行刪除
  1. search.py(154 行新增,54 行刪除)

+ 154
- 54
search.py 查看文件

@@ -1,3 +1,4 @@
1
+import argparse
1 2
 import re
2 3
 import platform
3 4
 import subprocess
@@ -18,6 +19,10 @@ class BooleanOperator(Enum):
18 19
 class Filter:
19 20
 	"""Base class for message filters."""
20 21
 	def matches(self, message: EmailMessage) -> bool:
22
+		"""Returns true if the given email message matches this filter's criteria."""
23
+		raise "Not implemented"
24
+	def matches_raw(self, raw_content: str) -> bool:
25
+		"""Returns true if the given raw, unparsed email content matches this filter's criteria."""
21 26
 		raise "Not implemented"
22 27
 
23 28
 class BodyKeywordFilter(Filter):
@@ -37,6 +42,15 @@ class BodyKeywordFilter(Filter):
37 42
 						return True
38 43
 		return False
39 44
 
45
+	def matches_raw(self, raw_content: str) -> bool:
46
+		if self.case_sensitive:
47
+			if self.keyword in raw_content:
48
+				return True
49
+			else:
50
+				if self.keyword.lower() in raw_content.lower():
51
+					return True
52
+		return False
53
+
40 54
 class HeaderFilter(Filter):
41 55
 	"""Matches a value in an email header. Can search one filter or multiple.
42 56
 	Header names case-insensitive; value is case-insensitive."""
@@ -70,10 +84,19 @@ class BooleanFilter(Filter):
70 84
 			return True
71 85
 		return False
72 86
 
73
-start_path = '.'
74
-output_path = TemporaryDirectory(prefix='Email search results (id ', suffix=')').name
87
+	def matches_raw(self, raw_content: str) -> bool:
88
+		for subfilter in self.subfilters:
89
+			result = subfilter.matches_raw(raw_content)
90
+			if self.operator == BooleanOperator.and_op and not result:
91
+				return False
92
+			if self.operator == BooleanOperator.or_op and result:
93
+				return True
94
+		if self.operator == BooleanOperator.and_op:
95
+			return True
96
+		return False
97
+
98
+args: argparse.Namespace = None
75 99
 filter = None
76
-case_sensitive = False
77 100
 result_count = 0
78 101
 zip_result_count = 0
79 102
 zip_count = 0
@@ -130,9 +153,9 @@ def process_zip_file(zip_path: str) -> None:
130 153
 				email = parser.parsebytes(data)
131 154
 				search_content(email, zip_path, entry)
132 155
 			except UnicodeError:
133
-				print('Unicode error in message. Skipping.')
156
+				print('Unicode error in message. Searching raw content.', file=sys.stderr)
134 157
 			except:
135
-				print('Error reading message')
158
+				print('Error parsing message. Searching raw content.', file=sys.stderr)
136 159
 	if zip_result_count > 0:
137 160
 		print(f"\t{zip_result_count} results in zip")
138 161
 
@@ -147,56 +170,133 @@ def search_content(email: EmailMessage, zip_path: str, entry: ZipInfo) -> None:
147 170
 			zip_result_count += 1
148 171
 			f.write(email.as_bytes())
149 172
 
173
+def search_raw_content(raw_bytes: bytes, zip_path: str, entry: ZipInfo) -> None:
174
+	global result_count, zip_result_count
175
+	try:
176
+		content = raw_bytes.decode('iso-8859-1', errors='ignore')
177
+	except:
178
+		try:
179
+			content = raw_bytes.decode('utf-8', errors='ignore')
180
+		except:
181
+			print('Cannot decode email bytes. Skipping.', file=sys.stderr)
182
+
150 183
 def parse_arguments():
151
-	"""Parses the command-line arguments."""
152
-	global filter
153
-	global start_path
154
-	global output_path
155
-	global case_sensitive
156
-	expect = 'script_name'
157
-	for arg in sys.argv:
158
-		if arg.startswith('-'):
159
-			if arg == '-d':
160
-				expect = 'start_path'
161
-			elif arg == '-o':
162
-				expect = 'output_path'
163
-			elif arg == '-c':
164
-				case_sensitive = True
165
-			else:
166
-				raise f'Unknown argument {arg}'
167
-		elif expect is not None:
168
-			if expect == 'script_name':
169
-				expect = None
170
-				continue
171
-			elif expect == 'start_path':
172
-				start_path = arg
173
-				expect = None
174
-			elif expect == 'output_path':
175
-				output_path = arg
176
-				expect = None
177
-			else:
178
-				raise f'Expected other argument {expect}'
179
-		else:
180
-			if filter is None:
181
-				words = arg.split(' ')
182
-				word_filters = []
183
-				for word in words:
184
-					word = word.strip()
185
-					if len(word) == 0:
186
-						continue
187
-					word_filters.append(BodyKeywordFilter(word, case_sensitive))
188
-				if len(word_filters) == 0:
189
-					continue
190
-				filter = BooleanFilter(BooleanOperator.and_op, word_filters)
191
-			else:
192
-				print('Too many arguments')
193
-				sys.exit(4)
184
+	"""Parses command-line arguments to `args`."""
185
+	global args
186
+	parser = argparse.ArgumentParser(
187
+		prog='search.py',
188
+		description='Searches a directory of zipped email messages. ' + \
189
+			'Messages are assumed to be stored one per file within the zip files (Maildir format). ' + \
190
+			'Input directories are searched recursively for any zip files contained within.',
191
+		epilog='Raw mode will skip parsing each email message and treat them like simple text files. ' + \
192
+			'The headers and body are all searched together without decoding. ' + \
193
+			'Arguments for searching individual fields will be ignored. ' + \
194
+			'This option exists for messages with encoding errors that prevent them from being ' + \
195
+			'parsed correctly. ' + \
196
+			'Note that various escaping/encoding schemes commonly used in email messages, such ' + \
197
+			'as base64, may cause keywords to not be found despite being in the decoded message ' + \
198
+			'because only the raw encoded content is searched. ' + \
199
+			'Use this option as a last resort.'
200
+	)
201
+	parser.add_argument(
202
+		'keywords',
203
+		action='append',
204
+		nargs='+',
205
+		help='one or more phrases to search for in the message body'
206
+	)
207
+	parser.add_argument(
208
+		'--any',
209
+		default=False,
210
+		action='store_true',
211
+		help='matches messages containing any of the given search phrases (default requires all phrases appear in a message)'
212
+	)
213
+	parser.add_argument(
214
+		'-d', '--dir',
215
+		action='append',
216
+		help='directory(s) to search for email zip archives (default is working directory)'
217
+	)
218
+	parser.add_argument(
219
+		'-o', '--output',
220
+		help='directory to copy matching messages to (default is a temp directory)'
221
+	)
222
+	parser.add_argument(
223
+		'-c', '--casesensitive',
224
+		default=False,
225
+		action='store_true',
226
+		help='search case-sensitively (default is case-insensitive)'
227
+	)
228
+	parser.add_argument(
229
+		'-f', '--from',
230
+		help='email address of sender'
231
+	)
232
+	parser.add_argument(
233
+		'-t', '--to',
234
+		help='email address of recipient (searches to:, cc:, bcc: fields)'
235
+	)
236
+	parser.add_argument(
237
+		'-s', '--subject',
238
+		help='searches subject field'
239
+	)
240
+	parser.add_argument(
241
+		'-a', '--after',
242
+		metavar='YYYY-MM-DD',
243
+		help='date to search on or after'
244
+	)
245
+	parser.add_argument(
246
+		'-b', '--before',
247
+		metavar='YYYY-MM-DD',
248
+		help='date to search on or before'
249
+	)
250
+	parser.add_argument(
251
+		'-r', '--raw',
252
+		default=False,
253
+		action='store_true',
254
+		help='searches raw email content (see below)'
255
+	)
256
+	args = parser.parse_args()
257
+
258
+	if args.before is not None:
259
+		m = re.match('^([0-9]{4})-([0-9]{2})-([0-9]{2})$', args.before)
260
+		if m is None:
261
+			parser.error('before date must be in YYYY-MM-DD format (e.g. 2015-03-28)')
262
+		args.before = [ int(m.group(1)), int(m.group(2)), int(m.group(3)) ]
263
+	if args.after is not None:
264
+		m = re.match('^([0-9]{4})-([0-9]{2})-([0-9]{2})$', args.after)
265
+		if m is None:
266
+			parser.error('after date must be in YYYY-MM-DD format (e.g. 2015-03-28)')
267
+		args.after = [ int(m.group(1)), int(m.group(2)), int(m.group(3)) ]
268
+	if args.raw:
269
+		if getattr(args, 'from') is not None or \
270
+			getattr(args, 'to') is not None or \
271
+			args.subject is not None or \
272
+			args.before is not None or \
273
+			args.after is not None:
274
+			print('Warning: Cannot search header fields in raw mode. Ignoring.', file=sys.stderr)
275
+	if args.dir is None:
276
+		args.dir = [ '.' ]
277
+	else:
278
+		for d in args.dir:
279
+			if not os.path.exists(d) or not os.path.isdir(d):
280
+				parser.error(f'search path \'{d}\' does not exist or is not a directory')
281
+	if args.output is None:
282
+		args.output = TemporaryDirectory(prefix='Email search results (id ', suffix=')').name
283
+	else:
284
+		if not os.path.exists(args.output) or not os.path.isdir(args.output):
285
+			parser.error(f'output path \'{args.output}\' does not exist or is not a directory')
194 286
 
195
-def validate_arguments():
196
-	if filter is None:
197
-		print('No filter specified')
198
-		sys.exit(3)
199
-	pass
287
+def construct_filter():
288
+	global filter
289
+	criteria: List[Filter] = []
290
+	keyword_filters = map(lambda k : BodyKeywordFilter(k, case_sensitive=args.casesensitive), args.keywords)
291
+	criteria.append(BooleanFilter(BooleanOperator.or_op if args.any else BooleanOperator.and_op, keyword_filters))
292
+	if getattr(args, 'from') is not None:
293
+		criteria.append(HeaderFilter('from', getattr(args, 'from')))
294
+	if getattr(args, 'to') is not None:
295
+		criteria.append(HeaderFilter(['to', 'cc', 'bcc'], getattr(args, 'to')))
296
+	if args.subject is not None:
297
+		criteria.append(HeaderFilter('subject', args.subject))
298
+	# TODO: Dates
299
+	filter = BooleanFilter(BooleanOperator.and_op, criteria)
200 300
 
201 301
 def handle_results():
202 302
 	"""Final logic after all searching is completed."""
@@ -215,6 +315,6 @@ def handle_results():
215 315
 
216 316
 # Main logic
217 317
 parse_arguments()
218
-validate_arguments()
318
+construct_filter()
219 319
 walk_directory(start_path)
220 320
 handle_results()

Loading…
取消
儲存