# Coverage report for pyparse.py: 98% of 286 statements covered.
# Generated by coverage.py v7.2.5 on 2023-05-11 13:22 -0700.
"""Define partial Python code Parser used by editor and hyperparser.

Instances of ParseMap are used with str.translate.

The following bound search and match functions are defined:
_synchre - start of popular statement;
_junkre - whitespace or comment line;
_match_stringre - string, possibly without closer;
_itemre - line that may have bracket structure start;
_closere - line that must be followed by dedent;
_chew_ordinaryre - non-special characters.
"""
import re

# Reason last statement is continued (or C_NONE if it's not).
#   C_NONE              - the last statement is complete
#   C_BACKSLASH         - the code ends with a backslash-newline
#   C_STRING_FIRST_LINE - inside an unclosed string, still on its first line
#   C_STRING_NEXT_LINES - inside an unclosed string, past its first line
#   C_BRACKET           - inside an unclosed bracket structure
(C_NONE, C_BACKSLASH, C_STRING_FIRST_LINE,
 C_STRING_NEXT_LINES, C_BRACKET) = range(5)
19# Find what looks like the start of a popular statement.
21_synchre = re.compile(r"""
22 ^
23 [ \t]*
24 (?: while
25 | else
26 | def
27 | return
28 | assert
29 | break
30 | class
31 | continue
32 | elif
33 | try
34 | except
35 | raise
36 | import
37 | yield
38 )
39 \b
40""", re.VERBOSE | re.MULTILINE).search
42# Match blank line or non-indenting comment line.
44_junkre = re.compile(r"""
45 [ \t]*
46 (?: \# \S .* )?
47 \n
48""", re.VERBOSE).match
50# Match any flavor of string; the terminating quote is optional
51# so that we're robust in the face of incomplete program text.
53_match_stringre = re.compile(r"""
54 \""" [^"\\]* (?:
55 (?: \\. | "(?!"") )
56 [^"\\]*
57 )*
58 (?: \""" )?
60| " [^"\\\n]* (?: \\. [^"\\\n]* )* "?
62| ''' [^'\\]* (?:
63 (?: \\. | '(?!'') )
64 [^'\\]*
65 )*
66 (?: ''' )?
68| ' [^'\\\n]* (?: \\. [^'\\\n]* )* '?
69""", re.VERBOSE | re.DOTALL).match
71# Match a line that starts with something interesting;
72# used to find the first item of a bracket structure.
74_itemre = re.compile(r"""
75 [ \t]*
76 [^\s#\\] # if we match, m.end()-1 is the interesting char
77""", re.VERBOSE).match
79# Match start of statements that should be followed by a dedent.
81_closere = re.compile(r"""
82 \s*
83 (?: return
84 | break
85 | continue
86 | raise
87 | pass
88 )
89 \b
90""", re.VERBOSE).match
92# Chew up non-special chars as quickly as possible. If match is
93# successful, m.end() less 1 is the index of the last boring char
94# matched. If match is unsuccessful, the string starts with an
95# interesting char.
97_chew_ordinaryre = re.compile(r"""
98 [^[\](){}#'"\\]+
99""", re.VERBOSE).match
class ParseMap(dict):
    r"""Dict subclass that maps anything not in dict to 'x'.

    This is designed to be used with str.translate in study1.
    Anything not specifically mapped otherwise becomes 'x'.
    Example: replace everything except whitespace with 'x'.

    >>> keepwhite = ParseMap((ord(c), ord(c)) for c in ' \t\n\r')
    >>> "a + b\tc\nd".translate(keepwhite)
    'x x x\tx\nx'
    """

    # Calling this triples access time; see bpo-32940
    def __missing__(self, key):
        """Return the code point of 'x' for any unmapped key.

        The key is not stored, so the mapping never grows on lookup.
        """
        return 120  # ord('x')
# Translation table used by Parser._study1 with str.translate.
# Map all ascii to 120 to avoid __missing__ call, then replace some.
trans = ParseMap.fromkeys(range(128), 120)
trans.update((ord(c), ord('(')) for c in "({[")  # open brackets => '(';
trans.update((ord(c), ord(')')) for c in ")}]")  # close brackets => ')'.
trans.update((ord(c), ord(c)) for c in "\"'\\\n#")  # Keep these.
class Parser:
    """Partial parser for a chunk of Python source ending at a newline.

    Typical use: create an instance, call set_code() (optionally followed
    by find_good_parse_start() and set_lo() to shorten the text), then use
    the query methods.  The analysis is done lazily in two passes:

    * _study1() finds the non-continuation lines and the reason (if any)
      the last statement is continued;
    * _study2() locates the last interesting statement and computes its
      bracketing structure.

    Attributes set by the constructor and set_code():
        indentwidth, tabwidth, code, study_level
    Attributes created by _study1():
        goodlines, continuation
    Attributes created by _study2():
        stmt_start, stmt_end, lastch, lastopenbracketpos, stmt_bracketing
    """

    def __init__(self, indentwidth, tabwidth):
        """Store the indent width and tab width used by the indent queries."""
        self.indentwidth = indentwidth
        self.tabwidth = tabwidth

    def set_code(self, s):
        """Set the text to analyze and discard any previous study results.

        s must be empty or end with a newline (checked with an assert).
        """
        assert len(s) == 0 or s[-1] == '\n'
        self.code = s
        self.study_level = 0

    def find_good_parse_start(self, is_char_in_string):
        """
        Return index of a good place to begin parsing, as close to the
        end of the string as possible.  This will be the start of some
        popular stmt like "while" or "def" (see _synchre).  Return None
        if none found:  the caller should pass more prior context then,
        if possible, or if not (the entire program text up until the
        point of interest has already been tried) pass 0 to set_lo().

        This will be reliable iff given a reliable is_char_in_string()
        function, meaning that when it says "no", it's absolutely
        guaranteed that the char is not in a string.
        """
        code, pos = self.code, None

        # Peek back from the end for a good place to start,
        # but don't try too often; pos will be left None, or
        # bumped to a legitimate synch point.
        limit = len(code)
        for _ in range(5):
            i = code.rfind(":\n", 0, limit)
            if i < 0:
                break
            i = code.rfind('\n', 0, i) + 1  # start of colon line (-1+1=0)
            m = _synchre(code, i, limit)
            if m and not is_char_in_string(m.start()):
                pos = m.start()
                break
            limit = i
        if pos is None:
            # Nothing looks like a block-opener, or stuff does
            # but is_char_in_string keeps returning true; most likely
            # we're in or near a giant string, the colorizer hasn't
            # caught up enough to be helpful, or there simply *aren't*
            # any interesting stmts.  In any of these cases we're
            # going to have to parse the whole thing to be sure, so
            # give it one last try from the start, but stop wasting
            # time here regardless of the outcome.
            m = _synchre(code)
            if m and not is_char_in_string(m.start()):
                pos = m.start()
            return pos

        # Peeking back worked; look forward until _synchre no longer
        # matches.
        i = pos + 1
        while m := _synchre(code, i):
            s, i = m.span()
            if not is_char_in_string(s):
                pos = s
        return pos

    def set_lo(self, lo):
        """ Throw away the start of the string.

        Intended to be called with the result of find_good_parse_start().
        lo must be 0 or the index just past a newline (checked with an
        assert).  study_level is not reset here, so call this before any
        method that triggers a study.
        """
        assert lo == 0 or self.code[lo-1] == '\n'
        if lo > 0:
            self.code = self.code[lo:]

    def _study1(self):
        """Find the line numbers of non-continuation lines.

        As quickly as humanly possible <wink>, find the line numbers (0-
        based) of the non-continuation lines.
        Creates self.{goodlines, continuation}.
        Does nothing if a study has already been done since set_code().
        """
        if self.study_level >= 1:
            return
        self.study_level = 1

        # Map all uninteresting characters to "x", all open brackets
        # to "(", all close brackets to ")", then collapse runs of
        # uninteresting characters.  This can cut the number of chars
        # by a factor of 10-40, and so greatly speed the following loop.
        code = self.code
        code = code.translate(trans)
        code = code.replace('xxxxxxxx', 'x')
        code = code.replace('xxxx', 'x')
        code = code.replace('xx', 'x')
        code = code.replace('xx', 'x')
        code = code.replace('\nx', '\n')
        # Replacing x\n with \n would be incorrect because
        # x may be preceded by a backslash.

        # March over the squashed version of the program, accumulating
        # the line numbers of non-continued stmts, and determining
        # whether & why the last stmt is a continuation.
        continuation = C_NONE
        level = lno = 0     # level is nesting level; lno is line number
        self.goodlines = goodlines = [0]
        push_good = goodlines.append
        i, n = 0, len(code)
        while i < n:
            ch = code[i]
            i = i+1

            # cases are checked in decreasing order of frequency
            if ch == 'x':
                continue

            if ch == '\n':
                lno = lno + 1
                if level == 0:
                    push_good(lno)
                    # else we're in an unclosed bracket structure
                continue

            if ch == '(':
                level = level + 1
                continue

            if ch == ')':
                if level:
                    level = level - 1
                    # else the program is invalid, but we can't complain
                continue

            if ch == '"' or ch == "'":
                # consume the string
                quote = ch
                if code[i-1:i+2] == quote * 3:
                    quote = quote * 3
                firstlno = lno
                w = len(quote) - 1  # extra chars in the quote: 0 or 2
                i = i+w
                while i < n:
                    ch = code[i]
                    i = i+1

                    if ch == 'x':
                        continue

                    if code[i-1:i+w] == quote:
                        i = i+w
                        break

                    if ch == '\n':
                        lno = lno + 1
                        if w == 0:
                            # unterminated single-quoted string
                            if level == 0:
                                push_good(lno)
                            break
                        continue

                    if ch == '\\':
                        assert i < n
                        if code[i] == '\n':
                            lno = lno + 1
                        i = i+1
                        continue

                    # else comment char or paren inside string

                else:
                    # didn't break out of the loop, so we're still
                    # inside a string
                    if (lno - 1) == firstlno:
                        # before the previous \n in code, we were in the first
                        # line of the string
                        continuation = C_STRING_FIRST_LINE
                    else:
                        continuation = C_STRING_NEXT_LINES
                continue    # with outer loop

            if ch == '#':
                # consume the comment
                i = code.find('\n', i)
                assert i >= 0
                continue

            assert ch == '\\'
            assert i < n
            if code[i] == '\n':
                lno = lno + 1
                if i+1 == n:
                    continuation = C_BACKSLASH
            i = i+1

        # The last stmt may be continued for all 3 reasons.
        # String continuation takes precedence over bracket
        # continuation, which beats backslash continuation.
        if (continuation != C_STRING_FIRST_LINE
                and continuation != C_STRING_NEXT_LINES and level > 0):
            continuation = C_BRACKET
        self.continuation = continuation

        # Push the final line number as a sentinel value, regardless of
        # whether it's continued.
        assert (continuation == C_NONE) == (goodlines[-1] == lno)
        if goodlines[-1] != lno:
            push_good(lno)

    def get_continuation_type(self):
        """Return the C_* constant saying why the last stmt is continued."""
        self._study1()
        return self.continuation

    def _study2(self):
        """
        study1 was sufficient to determine the continuation status,
        but doing more requires looking at every character.  study2
        does this for the last interesting statement in the block.
        Creates:
            self.stmt_start, stmt_end
                slice indices of last interesting stmt
            self.stmt_bracketing
                the bracketing structure of the last interesting stmt,
                as a tuple of (index, nesting level) pairs; for example,
                for the statement "say(boo) or die" it is
                ((0, 0), (3, 1), (8, 0)): level 0 from the start, level 1
                from the open bracket, level 0 again just past the close
                bracket.  Strings and comments are treated as brackets,
                for the matter.
            self.lastch
                last interesting character before optional trailing comment
            self.lastopenbracketpos
                index of the last unclosed open bracket, or None if
                there is none
        """
        if self.study_level >= 2:
            return
        self._study1()
        self.study_level = 2

        # Set p and q to slice indices of last interesting stmt.
        code, goodlines = self.code, self.goodlines
        i = len(goodlines) - 1  # Index of newest line.
        p = len(code)  # End of goodlines[i]
        while i:
            assert p
            # Make p be the index of the stmt at line number goodlines[i].
            # Move p back to the stmt at line number goodlines[i-1].
            q = p
            for _ in range(goodlines[i-1], goodlines[i]):
                # tricky:  sets p to 0 if no preceding newline
                p = code.rfind('\n', 0, p-1) + 1
            # The stmt code[p:q] isn't a continuation, but may be blank
            # or a non-indenting comment line.
            if _junkre(code, p):
                i = i-1
            else:
                break
        if i == 0:
            # nothing but junk!
            assert p == 0
            q = p
        self.stmt_start, self.stmt_end = p, q

        # Analyze this stmt, to find the last open bracket (if any)
        # and last interesting character (if any).
        lastch = ""
        stack = []  # stack of open bracket indices
        push_stack = stack.append
        bracketing = [(p, 0)]
        while p < q:
            # suck up all except ()[]{}'"#\\
            m = _chew_ordinaryre(code, p, q)
            if m:
                # we skipped at least one boring char
                newp = m.end()
                # back up over totally boring whitespace
                i = newp - 1    # index of last boring char
                while i >= p and code[i] in " \t\n":
                    i = i-1
                if i >= p:
                    lastch = code[i]
                p = newp
                if p >= q:
                    break

            ch = code[p]

            if ch in "([{":
                push_stack(p)
                bracketing.append((p, len(stack)))
                lastch = ch
                p = p+1
                continue

            if ch in ")]}":
                if stack:
                    del stack[-1]
                lastch = ch
                p = p+1
                bracketing.append((p, len(stack)))
                continue

            if ch == '"' or ch == "'":
                # consume string
                # Note that study1 did this with a Python loop, but
                # we use a regexp here; the reason is speed in both
                # cases; the string may be huge, but study1 pre-squashed
                # strings to a couple of characters per line.  study1
                # also needed to keep track of newlines, and we don't
                # have to.
                bracketing.append((p, len(stack)+1))
                lastch = ch
                p = _match_stringre(code, p, q).end()
                bracketing.append((p, len(stack)))
                continue

            if ch == '#':
                # consume comment and trailing newline
                bracketing.append((p, len(stack)+1))
                p = code.find('\n', p, q) + 1
                assert p > 0
                bracketing.append((p, len(stack)))
                continue

            assert ch == '\\'
            p = p+1     # beyond backslash
            assert p < q
            if code[p] != '\n':
                # the program is invalid, but can't complain
                lastch = ch + code[p]
            p = p+1     # beyond escaped char

        # end while p < q:

        self.lastch = lastch
        self.lastopenbracketpos = stack[-1] if stack else None
        self.stmt_bracketing = tuple(bracketing)

    def compute_bracket_indent(self):
        """Return number of spaces the next line should be indented.

        Line continuation must be C_BRACKET.  If an item follows the last
        open bracket, the result is the (tab-expanded) column of that
        item; otherwise it is the bracket line's indentation plus
        indentwidth.
        """
        self._study2()
        assert self.continuation == C_BRACKET
        j = self.lastopenbracketpos
        code = self.code
        n = len(code)
        origi = i = code.rfind('\n', 0, j) + 1
        j = j+1     # one beyond open bracket
        # find first list item; set i to start of its line
        while j < n:
            m = _itemre(code, j)
            if m:
                j = m.end() - 1     # index of first interesting char
                extra = 0
                break
            else:
                # this line is junk; advance to next line
                i = j = code.find('\n', j) + 1
        else:
            # nothing interesting follows the bracket;
            # reproduce the bracket line's indentation + a level
            j = i = origi
            while code[j] in " \t":
                j = j+1
            extra = self.indentwidth
        return len(code[i:j].expandtabs(self.tabwidth)) + extra

    def get_num_lines_in_stmt(self):
        """Return number of physical lines in last stmt.

        The statement doesn't have to be an interesting statement.  This is
        intended to be called when continuation is C_BACKSLASH.
        """
        self._study1()
        goodlines = self.goodlines
        return goodlines[-1] - goodlines[-2]

    def compute_backslash_indent(self):
        """Return number of spaces the next line should be indented.

        Line continuation must be C_BACKSLASH.  Also assume that the new
        line is the first one following the initial line of the stmt.
        """
        self._study2()
        assert self.continuation == C_BACKSLASH
        code = self.code
        i = self.stmt_start
        while code[i] in " \t":
            i = i+1
        startpos = i

        # See whether the initial line starts an assignment stmt; i.e.,
        # look for an = operator
        endpos = code.find('\n', startpos) + 1
        found = False
        level = 0
        while i < endpos:
            ch = code[i]
            if ch in "([{":
                level = level + 1
                i = i+1
            elif ch in ")]}":
                if level:
                    level = level - 1
                i = i+1
            elif ch == '"' or ch == "'":
                i = _match_stringre(code, i, endpos).end()
            elif ch == '#':
                # This line is unreachable because the # makes a comment of
                # everything after it.
                break
            elif (level == 0 and ch == '='
                    and (i == 0 or code[i-1] not in "=<>!")
                    and code[i+1] != '='):
                found = True
                break
            else:
                i = i+1

        if found:
            # found a legit =, but it may be the last interesting
            # thing on the line
            i = i+1     # move beyond the =
            found = re.match(r"\s*\\", code[i:endpos]) is None

        if not found:
            # oh well ... settle for moving beyond the first chunk
            # of non-whitespace chars
            i = startpos
            while code[i] not in " \t\n":
                i = i+1

        return len(code[self.stmt_start:i].expandtabs(self.tabwidth)) + 1

    def get_base_indent_string(self):
        """Return the leading whitespace on the initial line of the last
        interesting stmt.
        """
        self._study2()
        i, n = self.stmt_start, self.stmt_end
        j = i
        code = self.code
        while j < n and code[j] in " \t":
            j = j + 1
        return code[i:j]

    def is_block_opener(self):
        "Return True if the last interesting statement opens a block."
        self._study2()
        return self.lastch == ':'

    def is_block_closer(self):
        "Return True if the last interesting statement closes a block."
        self._study2()
        return _closere(self.code, self.stmt_start) is not None

    def get_last_stmt_bracketing(self):
        """Return bracketing structure of the last interesting statement.

        The returned tuple is in the format defined in _study2().
        """
        self._study2()
        return self.stmt_bracketing
# When run as a script, run this module's unit tests verbosely.
if __name__ == '__main__':
    from unittest import main
    main('idlelib.idle_test.test_pyparse', verbosity=2)