Coverage for pyparse.py: 98%

286 statements  

« prev     ^ index     » next       coverage.py v7.2.5, created at 2023-05-11 13:22 -0700

"""Define partial Python code Parser used by editor and hyperparser.

Instances of ParseMap are used with str.translate.

The following bound search and match functions are defined:
_synchre - start of popular statement;
_junkre - whitespace or comment line;
_match_stringre - string, possibly without closer;
_itemre - line that may have bracket structure start;
_closere - line that must be followed by dedent;
_chew_ordinaryre - non-special characters.
"""

import re

14 

# Reason last statement is continued (or C_NONE if it's not).
# The five codes are consecutive integers starting at 0.
(C_NONE, C_BACKSLASH, C_STRING_FIRST_LINE,
 C_STRING_NEXT_LINES, C_BRACKET) = range(5)

18 

19# Find what looks like the start of a popular statement. 

20 

21_synchre = re.compile(r""" 

22 ^ 

23 [ \t]* 

24 (?: while 

25 | else 

26 | def 

27 | return 

28 | assert 

29 | break 

30 | class 

31 | continue 

32 | elif 

33 | try 

34 | except 

35 | raise 

36 | import 

37 | yield 

38 ) 

39 \b 

40""", re.VERBOSE | re.MULTILINE).search 

41 

42# Match blank line or non-indenting comment line. 

43 

44_junkre = re.compile(r""" 

45 [ \t]* 

46 (?: \# \S .* )? 

47 \n 

48""", re.VERBOSE).match 

49 

50# Match any flavor of string; the terminating quote is optional 

51# so that we're robust in the face of incomplete program text. 

52 

53_match_stringre = re.compile(r""" 

54 \""" [^"\\]* (?: 

55 (?: \\. | "(?!"") ) 

56 [^"\\]* 

57 )* 

58 (?: \""" )? 

59 

60| " [^"\\\n]* (?: \\. [^"\\\n]* )* "? 

61 

62| ''' [^'\\]* (?: 

63 (?: \\. | '(?!'') ) 

64 [^'\\]* 

65 )* 

66 (?: ''' )? 

67 

68| ' [^'\\\n]* (?: \\. [^'\\\n]* )* '? 

69""", re.VERBOSE | re.DOTALL).match 

70 

71# Match a line that starts with something interesting; 

72# used to find the first item of a bracket structure. 

73 

74_itemre = re.compile(r""" 

75 [ \t]* 

76 [^\s#\\] # if we match, m.end()-1 is the interesting char 

77""", re.VERBOSE).match 

78 

79# Match start of statements that should be followed by a dedent. 

80 

81_closere = re.compile(r""" 

82 \s* 

83 (?: return 

84 | break 

85 | continue 

86 | raise 

87 | pass 

88 ) 

89 \b 

90""", re.VERBOSE).match 

91 

92# Chew up non-special chars as quickly as possible. If match is 

93# successful, m.end() less 1 is the index of the last boring char 

94# matched. If match is unsuccessful, the string starts with an 

95# interesting char. 

96 

97_chew_ordinaryre = re.compile(r""" 

98 [^[\](){}#'"\\]+ 

99""", re.VERBOSE).match 

100 

101 

class ParseMap(dict):
    r"""Dict subclass that maps anything not in dict to 'x'.

    This is designed to be used with str.translate in study1.
    Anything not specifically mapped otherwise becomes 'x'.
    Example: replace everything except whitespace with 'x'.

    >>> keepwhite = ParseMap((ord(c), ord(c)) for c in ' \t\n\r')
    >>> "a + b\tc\nd".translate(keepwhite)
    'x x x\tx\nx'
    """
    # Calling this triples access time; see bpo-32940
    def __missing__(self, key):
        """Return ord('x') for any key not stored; the key is not added."""
        return 120  # ord('x')

116 

117 

# Map all ascii to 120 to avoid __missing__ call, then replace some.
# The result is the str.translate table used by Parser._study1.
trans = ParseMap.fromkeys(range(128), 120)
trans.update((ord(c), ord('(')) for c in "({[")  # open brackets => '(';
trans.update((ord(c), ord(')')) for c in ")}]")  # close brackets => ')'.
trans.update((ord(c), ord(c)) for c in "\"'\\\n#")  # Keep these.

123 

124 

class Parser:
    """Partial parser for a chunk of Python source text.

    Typical use: construct with the indent and tab widths, hand over the
    text with set_code(), optionally trim it with set_lo(), then query
    the continuation type, indentation hints and bracketing of the last
    statement.  The analysis is done lazily in two passes (_study1 and
    _study2) whose progress is tracked in self.study_level.
    """

    def __init__(self, indentwidth, tabwidth):
        """Store the indent width and tab width used by the indent methods."""
        self.indentwidth = indentwidth
        self.tabwidth = tabwidth

    def set_code(self, s):
        """Set the text to analyze and reset any earlier analysis.

        s must be empty or end with a newline.
        """
        assert len(s) == 0 or s[-1] == '\n'
        self.code = s
        self.study_level = 0

    def find_good_parse_start(self, is_char_in_string):
        """
        Return index of a good place to begin parsing, as close to the
        end of the string as possible.  This will be the start of some
        popular stmt like "while" or "def".  Return None if none found:
        the caller should pass more prior context then, if possible, or
        if not (the entire program text up until the point of interest
        has already been tried) pass 0 to set_lo().

        This will be reliable iff given a reliable is_char_in_string()
        function, meaning that when it says "no", it's absolutely
        guaranteed that the char is not in a string.
        """
        code, pos = self.code, None

        # Peek back from the end for a good place to start,
        # but don't try too often; pos will be left None, or
        # bumped to a legitimate synch point.
        limit = len(code)
        for _ in range(5):
            i = code.rfind(":\n", 0, limit)
            if i < 0:
                break
            i = code.rfind('\n', 0, i) + 1  # start of colon line (-1+1=0)
            m = _synchre(code, i, limit)
            if m and not is_char_in_string(m.start()):
                pos = m.start()
                break
            limit = i
        if pos is None:
            # Nothing looks like a block-opener, or stuff does
            # but is_char_in_string keeps returning true; most likely
            # we're in or near a giant string, the colorizer hasn't
            # caught up enough to be helpful, or there simply *aren't*
            # any interesting stmts.  In any of these cases we're
            # going to have to parse the whole thing to be sure, so
            # give it one last try from the start, but stop wasting
            # time here regardless of the outcome.
            m = _synchre(code)
            if m and not is_char_in_string(m.start()):
                pos = m.start()
            return pos

        # Peeking back worked; look forward until _synchre no longer
        # matches.
        i = pos + 1
        while m := _synchre(code, i):
            s, i = m.span()
            if not is_char_in_string(s):
                pos = s
        return pos

    def set_lo(self, lo):
        """ Throw away the start of the string.

        Intended to be called with the result of find_good_parse_start().
        lo must be 0 or the index just past a newline.
        """
        assert lo == 0 or self.code[lo-1] == '\n'
        if lo > 0:
            self.code = self.code[lo:]

    def _study1(self):
        """Find the line numbers of non-continuation lines.

        As quickly as humanly possible <wink>, find the line numbers (0-
        based) of the non-continuation lines.
        Creates self.{goodlines, continuation}.
        """
        if self.study_level >= 1:
            return
        self.study_level = 1

        # Map all uninteresting characters to "x", all open brackets
        # to "(", all close brackets to ")", then collapse runs of
        # uninteresting characters.  This can cut the number of chars
        # by a factor of 10-40, and so greatly speed the following loop.
        code = self.code
        code = code.translate(trans)
        code = code.replace('xxxxxxxx', 'x')
        code = code.replace('xxxx', 'x')
        code = code.replace('xx', 'x')
        code = code.replace('xx', 'x')
        code = code.replace('\nx', '\n')
        # Replacing x\n with \n would be incorrect because
        # x may be preceded by a backslash.

        # March over the squashed version of the program, accumulating
        # the line numbers of non-continued stmts, and determining
        # whether & why the last stmt is a continuation.
        continuation = C_NONE
        level = lno = 0     # level is nesting level; lno is line number
        self.goodlines = goodlines = [0]
        push_good = goodlines.append
        i, n = 0, len(code)
        while i < n:
            ch = code[i]
            i = i+1

            # cases are checked in decreasing order of frequency
            if ch == 'x':
                continue

            if ch == '\n':
                lno = lno + 1
                if level == 0:
                    push_good(lno)
                    # else we're in an unclosed bracket structure
                continue

            if ch == '(':
                level = level + 1
                continue

            if ch == ')':
                if level:
                    level = level - 1
                    # else the program is invalid, but we can't complain
                continue

            if ch == '"' or ch == "'":
                # consume the string
                quote = ch
                if code[i-1:i+2] == quote * 3:
                    quote = quote * 3
                firstlno = lno
                w = len(quote) - 1
                i = i+w
                while i < n:
                    ch = code[i]
                    i = i+1

                    if ch == 'x':
                        continue

                    if code[i-1:i+w] == quote:
                        i = i+w
                        break

                    if ch == '\n':
                        lno = lno + 1
                        if w == 0:
                            # unterminated single-quoted string
                            if level == 0:
                                push_good(lno)
                            break
                        continue

                    if ch == '\\':
                        assert i < n
                        if code[i] == '\n':
                            lno = lno + 1
                        i = i+1
                        continue

                    # else comment char or paren inside string

                else:
                    # didn't break out of the loop, so we're still
                    # inside a string
                    if (lno - 1) == firstlno:
                        # before the previous \n in code, we were in the first
                        # line of the string
                        continuation = C_STRING_FIRST_LINE
                    else:
                        continuation = C_STRING_NEXT_LINES
                continue    # with outer loop

            if ch == '#':
                # consume the comment
                i = code.find('\n', i)
                assert i >= 0
                continue

            assert ch == '\\'
            assert i < n
            if code[i] == '\n':
                lno = lno + 1
                if i+1 == n:
                    continuation = C_BACKSLASH
            i = i+1

        # The last stmt may be continued for all 3 reasons.
        # String continuation takes precedence over bracket
        # continuation, which beats backslash continuation.
        if (continuation != C_STRING_FIRST_LINE
                and continuation != C_STRING_NEXT_LINES and level > 0):
            continuation = C_BRACKET
        self.continuation = continuation

        # Push the final line number as a sentinel value, regardless of
        # whether it's continued.
        assert (continuation == C_NONE) == (goodlines[-1] == lno)
        if goodlines[-1] != lno:
            push_good(lno)

    def get_continuation_type(self):
        """Return the continuation code (one of the C_* values) from _study1."""
        self._study1()
        return self.continuation

    def _study2(self):
        """
        study1 was sufficient to determine the continuation status,
        but doing more requires looking at every character.  study2
        does this for the last interesting statement in the block.
        Creates:
            self.stmt_start, stmt_end
                slice indices of last interesting stmt
            self.stmt_bracketing
                the bracketing structure of the last interesting stmt; for
                example, for the statement "say(boo) or die",
                stmt_bracketing will be ((0, 0), (0, 1), (2, 0), (2, 1),
                (4, 0)).  Strings and comments are treated as brackets, for
                the matter.
            self.lastch
                last interesting character before optional trailing comment
            self.lastopenbracketpos
                if continuation is C_BRACKET, index of last open bracket
        """
        if self.study_level >= 2:
            return
        self._study1()
        self.study_level = 2

        # Set p and q to slice indices of last interesting stmt.
        code, goodlines = self.code, self.goodlines
        i = len(goodlines) - 1  # Index of newest line.
        p = len(code)  # End of goodlines[i]
        while i:
            assert p
            # Make p be the index of the stmt at line number goodlines[i].
            # Move p back to the stmt at line number goodlines[i-1].
            q = p
            for _ in range(goodlines[i-1], goodlines[i]):
                # tricky: sets p to 0 if no preceding newline
                p = code.rfind('\n', 0, p-1) + 1
            # The stmt code[p:q] isn't a continuation, but may be blank
            # or a non-indenting comment line.
            if _junkre(code, p):
                i = i-1
            else:
                break
        if i == 0:
            # nothing but junk!
            assert p == 0
            q = p
        self.stmt_start, self.stmt_end = p, q

        # Analyze this stmt, to find the last open bracket (if any)
        # and last interesting character (if any).
        lastch = ""
        stack = []  # stack of open bracket indices
        push_stack = stack.append
        bracketing = [(p, 0)]
        while p < q:
            # suck up all except ()[]{}'"#\\
            m = _chew_ordinaryre(code, p, q)
            if m:
                # we skipped at least one boring char
                newp = m.end()
                # back up over totally boring whitespace
                i = newp - 1    # index of last boring char
                while i >= p and code[i] in " \t\n":
                    i = i-1
                if i >= p:
                    lastch = code[i]
                p = newp
                if p >= q:
                    break

            ch = code[p]

            if ch in "([{":
                push_stack(p)
                bracketing.append((p, len(stack)))
                lastch = ch
                p = p+1
                continue

            if ch in ")]}":
                if stack:
                    del stack[-1]
                lastch = ch
                p = p+1
                bracketing.append((p, len(stack)))
                continue

            if ch == '"' or ch == "'":
                # consume string
                # Note that study1 did this with a Python loop, but
                # we use a regexp here; the reason is speed in both
                # cases; the string may be huge, but study1 pre-squashed
                # strings to a couple of characters per line.  study1
                # also needed to keep track of newlines, and we don't
                # have to.
                bracketing.append((p, len(stack)+1))
                lastch = ch
                p = _match_stringre(code, p, q).end()
                bracketing.append((p, len(stack)))
                continue

            if ch == '#':
                # consume comment and trailing newline
                bracketing.append((p, len(stack)+1))
                p = code.find('\n', p, q) + 1
                assert p > 0
                bracketing.append((p, len(stack)))
                continue

            assert ch == '\\'
            p = p+1     # beyond backslash
            assert p < q
            if code[p] != '\n':
                # the program is invalid, but can't complain
                lastch = ch + code[p]
            p = p+1     # beyond escaped char

        # end while p < q:

        self.lastch = lastch
        self.lastopenbracketpos = stack[-1] if stack else None
        self.stmt_bracketing = tuple(bracketing)

    def compute_bracket_indent(self):
        """Return number of spaces the next line should be indented.

        Line continuation must be C_BRACKET.
        """
        self._study2()
        assert self.continuation == C_BRACKET
        j = self.lastopenbracketpos
        code = self.code
        n = len(code)
        origi = i = code.rfind('\n', 0, j) + 1
        j = j+1     # one beyond open bracket
        # find first list item; set i to start of its line
        while j < n:
            m = _itemre(code, j)
            if m:
                j = m.end() - 1     # index of first interesting char
                extra = 0
                break
            else:
                # this line is junk; advance to next line
                i = j = code.find('\n', j) + 1
        else:
            # nothing interesting follows the bracket;
            # reproduce the bracket line's indentation + a level
            j = i = origi
            while code[j] in " \t":
                j = j+1
            extra = self.indentwidth
        return len(code[i:j].expandtabs(self.tabwidth)) + extra

    def get_num_lines_in_stmt(self):
        """Return number of physical lines in last stmt.

        The statement doesn't have to be an interesting statement.  This is
        intended to be called when continuation is C_BACKSLASH.
        """
        self._study1()
        goodlines = self.goodlines
        return goodlines[-1] - goodlines[-2]

    def compute_backslash_indent(self):
        """Return number of spaces the next line should be indented.

        Line continuation must be C_BACKSLASH.  Also assume that the new
        line is the first one following the initial line of the stmt.
        """
        self._study2()
        assert self.continuation == C_BACKSLASH
        code = self.code
        i = self.stmt_start
        while code[i] in " \t":
            i = i+1
        startpos = i

        # See whether the initial line starts an assignment stmt; i.e.,
        # look for an = operator
        endpos = code.find('\n', startpos) + 1
        found = level = 0
        while i < endpos:
            ch = code[i]
            if ch in "([{":
                level = level + 1
                i = i+1
            elif ch in ")]}":
                if level:
                    level = level - 1
                i = i+1
            elif ch == '"' or ch == "'":
                i = _match_stringre(code, i, endpos).end()
            elif ch == '#':
                # This line is unreachable because the # makes a comment of
                # everything after it.
                break
            elif (level == 0 and ch == '='
                    and (i == 0 or code[i-1] not in "=<>!")
                    and code[i+1] != '='):
                found = 1
                break
            else:
                i = i+1

        if found:
            # found a legit =, but it may be the last interesting
            # thing on the line
            i = i+1     # move beyond the =
            found = re.match(r"\s*\\", code[i:endpos]) is None

        if not found:
            # oh well ... settle for moving beyond the first chunk
            # of non-whitespace chars
            i = startpos
            while code[i] not in " \t\n":
                i = i+1

        return len(code[self.stmt_start:i].expandtabs(self.tabwidth)) + 1

    def get_base_indent_string(self):
        """Return the leading whitespace on the initial line of the last
        interesting stmt.
        """
        self._study2()
        i, n = self.stmt_start, self.stmt_end
        j = i
        code = self.code
        while j < n and code[j] in " \t":
            j = j + 1
        return code[i:j]

    def is_block_opener(self):
        "Return True if the last interesting statement opens a block."
        self._study2()
        return self.lastch == ':'

    def is_block_closer(self):
        "Return True if the last interesting statement closes a block."
        self._study2()
        return _closere(self.code, self.stmt_start) is not None

    def get_last_stmt_bracketing(self):
        """Return bracketing structure of the last interesting statement.

        The returned tuple is in the format defined in _study2().
        """
        self._study2()
        return self.stmt_bracketing

585 

586 

# When run as a script, execute this module's unit tests.
if __name__ == '__main__':
    from unittest import main
    main('idlelib.idle_test.test_pyparse', verbosity=2)