/[cl-ppcre]/cl-ppcre/api.lisp
ViewVC logotype

Contents of /cl-ppcre/api.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (show annotations)
Thu Apr 22 18:53:13 2004 UTC (9 years, 11 months ago) by eweitz
Branch: MAIN
CVS Tags: HEAD
Changes since 1.1: +11 -3 lines
pre-0.7.7, including hyperdoc support
1 ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
2 ;;; $Header: /tiger/var/lib/cvsroots/cl-ppcre/cl-ppcre/api.lisp,v 1.2 2004/04/22 18:53:13 eweitz Exp $
3
4 ;;; The external API for creating and using scanners.
5
6 ;;; Copyright (c) 2002-2004, Dr. Edmund Weitz. All rights reserved.
7
8 ;;; Redistribution and use in source and binary forms, with or without
9 ;;; modification, are permitted provided that the following conditions
10 ;;; are met:
11
12 ;;; * Redistributions of source code must retain the above copyright
13 ;;; notice, this list of conditions and the following disclaimer.
14
15 ;;; * Redistributions in binary form must reproduce the above
16 ;;; copyright notice, this list of conditions and the following
17 ;;; disclaimer in the documentation and/or other materials
18 ;;; provided with the distribution.
19
20 ;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
21 ;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
22 ;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 ;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
24 ;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 ;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
26 ;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 ;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
28 ;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
32 (in-package #:cl-ppcre)
33
34 (defgeneric create-scanner (regex &key case-insensitive-mode
35 multi-line-mode
36 single-line-mode
37 extended-mode
38 destructive)
39 (:documentation "Accepts a regular expression - either as a
40 parse-tree or as a string - and returns a scan closure which will scan
41 strings for this regular expression. The \"mode\" keyboard arguments
42 are equivalent to the imsx modifiers in Perl. If DESTRUCTIVE is not
43 NIL the function is allowed to destructively modify its first argument
44 \(but only if it's a parse tree)."))
45
46 (defmethod create-scanner ((regex-string string) &key case-insensitive-mode
47 multi-line-mode
48 single-line-mode
49 extended-mode
50 destructive)
51 (declare (optimize speed
52 (safety 0)
53 (space 0)
54 (debug 0)
55 (compilation-speed 0)
56 #+:lispworks (hcl:fixnum-safety 0)))
57 (declare (ignore destructive))
58 ;; parse the string into a parse-tree and then call CREATE-SCANNER
59 ;; again
60 (let* ((*extended-mode-p* extended-mode)
61 (quoted-regex-string (if *allow-quoting*
62 (quote-sections (clean-comments regex-string extended-mode))
63 regex-string))
64 (*syntax-error-string* (copy-seq quoted-regex-string)))
65 ;; wrap the result with :GROUP to avoid infinite loops for
66 ;; constant strings
67 (create-scanner (cons :group (list (parse-string quoted-regex-string)))
68 :case-insensitive-mode case-insensitive-mode
69 :multi-line-mode multi-line-mode
70 :single-line-mode single-line-mode
71 :destructive t)))
72
73 (defmethod create-scanner ((scanner function) &key case-insensitive-mode
74 multi-line-mode
75 single-line-mode
76 extended-mode
77 destructive)
78 (declare (optimize speed
79 (safety 0)
80 (space 0)
81 (debug 0)
82 (compilation-speed 0)
83 #+:lispworks (hcl:fixnum-safety 0)))
84 (declare (ignore destructive))
85 (when (or case-insensitive-mode multi-line-mode single-line-mode extended-mode)
86 (signal-ppcre-invocation-error
87 "You can't use the keyword arguments to modify an existing scanner."))
88 scanner)
89
90 (defmethod create-scanner ((parse-tree t) &key case-insensitive-mode
91 multi-line-mode
92 single-line-mode
93 extended-mode
94 destructive)
95 (declare (optimize speed
96 (safety 0)
97 (space 0)
98 (debug 0)
99 (compilation-speed 0)
100 #+:lispworks (hcl:fixnum-safety 0)))
101 (when extended-mode
102 (signal-ppcre-invocation-error
103 "Extended mode doesn't make sense in parse trees."))
104 ;; convert parse-tree into internal representation REGEX and at the
105 ;; same time compute the number of registers and the constant string
106 ;; (or anchor) the regex starts with (if any)
107 (unless destructive
108 (setq parse-tree (copy-tree parse-tree)))
109 (let (flags)
110 (if single-line-mode
111 (push :single-line-mode-p flags))
112 (if multi-line-mode
113 (push :multi-line-mode-p flags))
114 (if case-insensitive-mode
115 (push :case-insensitive-p flags))
116 (when flags
117 (setq parse-tree (list :group (cons :flags flags) parse-tree))))
118 (let ((*syntax-error-string* nil))
119 (multiple-value-bind (regex reg-num starts-with)
120 (convert parse-tree)
121 ;; simplify REGEX by flattening nested SEQ and ALTERNATION
122 ;; constructs and gathering STR objects
123 (let ((regex (gather-strings (flatten regex))))
124 ;; set the MIN-REST slots of the REPETITION objects
125 (compute-min-rest regex 0)
126 ;; set the OFFSET slots of the STR objects
127 (compute-offsets regex 0)
128 (let* (end-string-offset
129 end-anchored-p
130 ;; compute the constant string the regex ends with (if
131 ;; any) and at the same time set the special variables
132 ;; END-STRING-OFFSET and END-ANCHORED-P
133 (end-string (end-string regex))
134 ;; if we found a non-zero-length end-string we create an
135 ;; efficient search function for it
136 (end-string-test (and end-string
137 (plusp (len end-string))
138 (if (= 1 (len end-string))
139 (create-char-searcher
140 (schar (str end-string) 0)
141 (case-insensitive-p end-string))
142 (create-bmh-matcher
143 (str end-string)
144 (case-insensitive-p end-string)))))
145 ;; initialize the counters for CREATE-MATCHER-AUX
146 (*rep-num* 0)
147 (*zero-length-num* 0)
148 ;; create the actual matcher function (which does all the
149 ;; work of matching the regular expression) corresponding
150 ;; to REGEX and at the same time set the special
151 ;; variables *REP-NUM* and *ZERO-LENGTH-NUM*
152 (match-fn (create-matcher-aux regex #'identity))
153 ;; if the regex starts with a string we create an
154 ;; efficient search function for it
155 (start-string-test (and (typep starts-with 'str)
156 (plusp (len starts-with))
157 (if (= 1 (len starts-with))
158 (create-char-searcher
159 (schar (str starts-with) 0)
160 (case-insensitive-p starts-with))
161 (create-bmh-matcher
162 (str starts-with)
163 (case-insensitive-p starts-with))))))
164 (declare (special end-string-offset end-anchored-p end-string))
165 ;; now create the scanner and return it
166 (create-scanner-aux match-fn
167 (regex-min-length regex)
168 (or (start-anchored-p regex)
169 ;; a dot in single-line-mode also
170 ;; implicitely anchors the regex at
171 ;; the start, i.e. if we can't match
172 ;; from the first position we won't
173 ;; match at all
174 (and (typep starts-with 'everything)
175 (single-line-p starts-with)))
176 starts-with
177 start-string-test
178 ;; only mark regex as end-anchored if we
179 ;; found a non-zero-length string before
180 ;; the anchor
181 (and end-string-test end-anchored-p)
182 end-string-test
183 (if end-string-test
184 (len end-string)
185 nil)
186 end-string-offset
187 *rep-num*
188 *zero-length-num*
189 reg-num))))))
190
191 (defgeneric scan (regex target-string &key start end)
192 (:documentation "Searches TARGET-STRING from START to END and tries
193 to match REGEX. On success returns four values - the start of the
194 match, the end of the match, and two arrays denoting the beginnings
195 and ends of register matches. On failure returns NIL. REGEX can be a
196 string which will be parsed according to Perl syntax, a parse tree, or
197 a pre-compiled scanner created by CREATE-SCANNER. TARGET-STRING will
198 be coerced to a simple string if it isn't one already."))
199
200 (defmethod scan ((regex-string string) target-string
201 &key (start 0)
202 (end (length target-string)))
203 (declare (optimize speed
204 (safety 0)
205 (space 0)
206 (debug 0)
207 (compilation-speed 0)
208 #+:lispworks (hcl:fixnum-safety 0)))
209 ;; note that the scanners are optimized for simple strings so we
210 ;; have to coerce TARGET-STRING into one if it isn't already
211 (funcall (create-scanner regex-string)
212 (maybe-coerce-to-simple-string target-string)
213 start end))
214
215 (defmethod scan ((scanner function) target-string
216 &key (start 0)
217 (end (length target-string)))
218 (declare (optimize speed
219 (safety 0)
220 (space 0)
221 (debug 0)
222 (compilation-speed 0)
223 #+:lispworks (hcl:fixnum-safety 0)))
224 (funcall scanner
225 (maybe-coerce-to-simple-string target-string)
226 start end))
227
228 (defmethod scan ((parse-tree t) target-string
229 &key (start 0)
230 (end (length target-string)))
231 (declare (optimize speed
232 (safety 0)
233 (space 0)
234 (debug 0)
235 (compilation-speed 0)
236 #+:lispworks (hcl:fixnum-safety 0)))
237 (funcall (create-scanner parse-tree)
238 (maybe-coerce-to-simple-string target-string)
239 start end))
240
241 (define-compiler-macro scan (&whole form regex target-string &rest rest)
242 "Make sure that constant forms are compiled into scanners at compile time."
243 (cond ((constantp regex)
244 `(scan (load-time-value
245 (create-scanner ,regex))
246 ,target-string ,@rest))
247 (t form)))
248
249 (defun scan-to-strings (regex target-string &key (start 0)
250 (end (length target-string))
251 sharedp)
252 (declare (optimize speed
253 (safety 0)
254 (space 0)
255 (debug 0)
256 (compilation-speed 0)
257 #+:lispworks (hcl:fixnum-safety 0)))
258 "Like SCAN but returns substrings of TARGET-STRING instead of
259 positions, i.e. this function returns two values on success: the whole
260 match as a string plus an array of substrings (or NILs) corresponding
261 to the matched registers. If SHAREDP is true, the substrings may share
262 structure with TARGET-STRING."
263 (multiple-value-bind (match-start match-end reg-starts reg-ends)
264 (scan regex target-string :start start :end end)
265 (unless match-start
266 (return-from scan-to-strings nil))
267 (let ((substr-fn (if sharedp #'nsubseq #'subseq)))
268 (values (funcall substr-fn
269 target-string match-start match-end)
270 (map 'vector
271 (lambda (reg-start reg-end)
272 (if reg-start
273 (funcall substr-fn
274 target-string reg-start reg-end)
275 nil))
276 reg-starts
277 reg-ends)))))
278
279 (defmacro register-groups-bind (var-list (regex target-string
280 &key start end sharedp)
281 &body body)
282 "Executes BODY with the variables in VAR-LIST bound to the
283 corresponding register groups after TARGET-STRING has been matched
284 against REGEX, i.e. each variable is either bound to a string or to
285 NIL. If there is no match, BODY is _not_ executed. For each element of
286 VAR-LIST which is NIL there's no binding to the corresponding register
287 group. The number of variables in VAR-LIST must not be greater than
288 the number of register groups. If SHAREDP is true, the substrings may
289 share structure with TARGET-STRING."
290 (rebinding (target-string)
291 (with-unique-names (match-start match-end reg-starts reg-ends
292 start-index substr-fn)
293 `(multiple-value-bind (,match-start ,match-end ,reg-starts ,reg-ends)
294 (scan ,regex ,target-string :start (or ,start 0)
295 :end (or ,end (length ,target-string)))
296 (declare (ignore ,match-end))
297 (when ,match-start
298 (let* ,(cons
299 `(,substr-fn (if ,sharedp
300 #'nsubseq
301 #'subseq))
302 (loop for var in var-list
303 for counter from 0
304 when var
305 collect `(,var (let ((,start-index
306 (aref ,reg-starts ,counter)))
307 (if ,start-index
308 (funcall ,substr-fn
309 ,target-string
310 ,start-index
311 (aref ,reg-ends ,counter))
312 nil)))))
313 ,@body))))))
314
315 (defmacro do-scans ((match-start match-end reg-starts reg-ends regex
316 target-string
317 &optional result-form
318 &key start end)
319 &body body)
320 "Iterates over TARGET-STRING and tries to match REGEX as often as
321 possible evaluating BODY with MATCH-START, MATCH-END, REG-STARTS, and
322 REG-ENDS bound to the four return values of each match in turn. After
323 the last match, returns RESULT-FORM if provided or NIL otherwise. An
324 implicit block named NIL surrounds DO-SCANS; RETURN may be used to
325 terminate the loop immediately. If REGEX matches an empty string the
326 scan is continued one position behind this match. BODY may start with
327 declarations."
328 (rebinding (target-string regex)
329 (with-unique-names (%start %end scanner loop-tag block-name)
330 ;; the NIL BLOCK to enable exits via (RETURN ...)
331 `(block nil
332 (let* ((,%start (or ,start 0))
333 (*real-start-pos* ,%start)
334 (,%end (or ,end (length ,target-string)))
335 ;; create a scanner unless the regex is already a
336 ;; function (otherwise SCAN will do this on each
337 ;; iteration)
338 (,scanner (typecase ,regex
339 (function ,regex)
340 (t (create-scanner ,regex)))))
341 ;; coerce TARGET-STRING to a simple string unless it is one
342 ;; already (otherwise SCAN will do this on each iteration)
343 (setq ,target-string
344 (maybe-coerce-to-simple-string ,target-string))
345 ;; a named BLOCK so we can exit the TAGBODY
346 (block ,block-name
347 (tagbody
348 ,loop-tag
349 ;; invoke SCAN and bind the returned values to the
350 ;; provided variables
351 (multiple-value-bind
352 (,match-start ,match-end ,reg-starts ,reg-ends)
353 (scan ,scanner ,target-string :start ,%start :end ,%end)
354 ;; declare the variables to be IGNORABLE to prevent the
355 ;; compiler from issuing warnings
356 (declare
357 (ignorable ,match-start ,match-end ,reg-starts ,reg-ends))
358 (unless ,match-start
359 ;; stop iteration on first failure
360 (return-from ,block-name ,result-form))
361 ;; execute BODY (wrapped in LOCALLY so it can start with
362 ;; declarations)
363 (locally
364 ,@body)
365 ;; advance by one position if we had a zero-length match
366 (setq ,%start (if (= ,%start ,match-end)
367 (1+ ,match-end)
368 ,match-end)))
369 (go ,loop-tag))))))))
370
371 (defmacro do-matches ((match-start match-end regex
372 target-string
373 &optional result-form
374 &key start end)
375 &body body)
376 "Iterates over TARGET-STRING and tries to match REGEX as often as
377 possible evaluating BODY with MATCH-START and MATCH-END bound to the
378 start/end positions of each match in turn. After the last match,
379 returns RESULT-FORM if provided or NIL otherwise. An implicit block
380 named NIL surrounds DO-MATCHES; RETURN may be used to terminate the
381 loop immediately. If REGEX matches an empty string the scan is
382 continued one position behind this match. BODY may start with
383 declarations."
384 ;; this is a simplified form of DO-SCANS - we just provide two dummy
385 ;; vars and ignore them
386 (with-unique-names (reg-starts reg-ends)
387 `(do-scans (,match-start ,match-end
388 ,reg-starts ,reg-ends
389 ,regex ,target-string
390 ,result-form
391 :start ,start :end ,end)
392 ,@body)))
393
394 (defmacro do-matches-as-strings ((match-var regex
395 target-string
396 &optional result-form
397 &key start end sharedp)
398 &body body)
399 "Iterates over TARGET-STRING and tries to match REGEX as often as
400 possible evaluating BODY with MATCH-VAR bound to the substring of
401 TARGET-STRING corresponding to each match in turn. After the last
402 match, returns RESULT-FORM if provided or NIL otherwise. An implicit
403 block named NIL surrounds DO-MATCHES-AS-STRINGS; RETURN may be used to
404 terminate the loop immediately. If REGEX matches an empty string the
405 scan is continued one position behind this match. If SHAREDP is true,
406 the substrings may share structure with TARGET-STRING. BODY may start
407 with declarations."
408 (rebinding (target-string)
409 (with-unique-names (match-start match-end substr-fn)
410 `(let ((,substr-fn (if ,sharedp #'nsubseq #'subseq)))
411 ;; simple use DO-MATCHES to extract the substrings
412 (do-matches (,match-start ,match-end ,regex ,target-string
413 ,result-form :start ,start :end ,end)
414 (let ((,match-var
415 (funcall ,substr-fn
416 ,target-string ,match-start ,match-end)))
417 ,@body))))))
418
419 (defmacro do-register-groups (var-list (regex target-string
420 &optional result-form
421 &key start end sharedp)
422 &body body)
423 "Iterates over TARGET-STRING and tries to match REGEX as often as
424 possible evaluating BODY with the variables in VAR-LIST bound to the
425 corresponding register groups for each match in turn, i.e. each
426 variable is either bound to a string or to NIL. For each element of
427 VAR-LIST which is NIL there's no binding to the corresponding register
428 group. The number of variables in VAR-LIST must not be greater than
429 the number of register groups. After the last match, returns
430 RESULT-FORM if provided or NIL otherwise. An implicit block named NIL
431 surrounds DO-REGISTER-GROUPS; RETURN may be used to terminate the loop
432 immediately. If REGEX matches an empty string the scan is continued
433 one position behind this match. If SHAREDP is true, the substrings may
434 share structure with TARGET-STRING. BODY may start with declarations."
435 (rebinding (target-string)
436 (with-unique-names (substr-fn match-start match-end
437 reg-starts reg-ends start-index)
438 `(let ((,substr-fn (if ,sharedp
439 #'nsubseq
440 #'subseq)))
441 (do-scans (,match-start ,match-end ,reg-starts ,reg-ends
442 ,regex ,target-string
443 ,result-form :start ,start :end ,end)
444 (let ,(loop for var in var-list
445 for counter from 0
446 collect `(,var (let ((,start-index
447 (aref ,reg-starts ,counter)))
448 (if ,start-index
449 (funcall ,substr-fn
450 ,target-string
451 ,start-index
452 (aref ,reg-ends ,counter))
453 nil))))
454 ,@body))))))
455
456 (defun all-matches (regex target-string
457 &key (start 0)
458 (end (length target-string)))
459 (declare (optimize speed
460 (safety 0)
461 (space 0)
462 (debug 0)
463 (compilation-speed 0)
464 #+:lispworks (hcl:fixnum-safety 0)))
465 "Returns a list containing the start and end positions of all
466 matches of REGEX against TARGET-STRING, i.e. if there are N matches
467 the list contains (* 2 N) elements. If REGEX matches an empty string
468 the scan is continued one position behind this match."
469 (let (result-list)
470 (do-matches (match-start match-end
471 regex target-string
472 (nreverse result-list)
473 :start start :end end)
474 (push match-start result-list)
475 (push match-end result-list))))
476
477 (defun all-matches-as-strings (regex target-string
478 &key (start 0)
479 (end (length target-string))
480 sharedp)
481 (declare (optimize speed
482 (safety 0)
483 (space 0)
484 (debug 0)
485 (compilation-speed 0)
486 #+:lispworks (hcl:fixnum-safety 0)))
487 "Returns a list containing all substrings of TARGET-STRING which
488 match REGEX. If REGEX matches an empty string the scan is continued
489 one position behind this match. If SHAREDP is true, the substrings may
490 share structure with TARGET-STRING."
491 (let (result-list)
492 (do-matches-as-strings (match regex target-string (nreverse result-list)
493 :start start :end end :sharedp sharedp)
494 (push match result-list))))
495
496 (defun split (regex target-string
497 &key (start 0)
498 (end (length target-string))
499 limit
500 with-registers-p
501 omit-unmatched-p
502 sharedp)
503 (declare (optimize speed
504 (safety 0)
505 (space 0)
506 (debug 0)
507 (compilation-speed 0)
508 #+:lispworks (hcl:fixnum-safety 0)))
509 "Matches REGEX against TARGET-STRING as often as possible and
510 returns a list of the substrings between the matches. If
511 WITH-REGISTERS-P is true, substrings corresponding to matched
512 registers are inserted into the list as well. If OMIT-UNMATCHED-P is
513 true, unmatched registers will simply be left out, otherwise they will
514 show up as NIL. LIMIT limits the number of elements returned -
515 registers aren't counted. If LIMIT is NIL (or 0 which is equivalent),
516 trailing empty strings are removed from the result list. If REGEX
517 matches an empty string the scan is continued one position behind this
518 match. If SHAREDP is true, the substrings may share structure with
519 TARGET-STRING."
520 ;; initialize list of positions POS-LIST to extract substrings with
521 ;; START so that the start of the next match will mark the end of
522 ;; the first substring
523 (let ((pos-list (list start))
524 (counter 0))
525 ;; how would Larry Wall do it?
526 (when (eql limit 0)
527 (setq limit nil))
528 (do-scans (match-start match-end
529 reg-starts reg-ends
530 regex target-string nil
531 :start start :end end)
532 (unless (and (= match-start match-end)
533 (= match-start (car pos-list)))
534 ;; push start of match on list unless this would be an empty
535 ;; string adjacent to the last element pushed onto the list
536 (when (and limit
537 (>= (incf counter) limit))
538 (return))
539 (push match-start pos-list)
540 (when with-registers-p
541 ;; optionally insert matched registers
542 (loop for reg-start across reg-starts
543 for reg-end across reg-ends
544 if reg-start
545 ;; but only if they've matched
546 do (push reg-start pos-list)
547 (push reg-end pos-list)
548 else unless omit-unmatched-p
549 ;; or if we're allowed to insert NIL instead
550 do (push nil pos-list)
551 (push nil pos-list)))
552 ;; now end of match
553 (push match-end pos-list)))
554 ;; end of whole string
555 (push end pos-list)
556 ;; now collect substrings
557 (nreverse
558 (loop with substr-fn = (if sharedp #'nsubseq #'subseq)
559 with string-seen = nil
560 for (this-end this-start) on pos-list by #'cddr
561 ;; skip empty strings from end of list
562 if (or limit
563 (setq string-seen
564 (or string-seen
565 (and this-start
566 (> this-end this-start)))))
567 collect (if this-start
568 (funcall substr-fn
569 target-string this-start this-end)
570 nil)))))
571
572 (define-compiler-macro split (&whole form regex target-string &rest rest)
573 "Make sure that constant forms are compiled into scanners at compile time."
574 (cond ((constantp regex)
575 `(split (load-time-value
576 (create-scanner ,regex))
577 ,target-string ,@rest))
578 (t form)))
579
580 (defun string-case-modifier (str from to start end)
581 (declare (optimize speed
582 (safety 0)
583 (space 0)
584 (debug 0)
585 (compilation-speed 0)
586 #+:lispworks (hcl:fixnum-safety 0)))
587 (declare (type fixnum from to start end))
588 "Checks whether all words in STR between FROM and TO are upcased,
589 downcased or capitalized and returns a function which applies a
590 corresponding case modification to strings. Returns #'IDENTITY
591 otherwise, especially if words in the target area extend beyond FROM
592 or TO. STR is supposed to be bounded by START and END. It is assumed
593 that (<= START FROM TO END)."
594 (case
595 (if (or (<= to from)
596 (and (< start from)
597 (alphanumericp (char str (1- from)))
598 (alphanumericp (char str from)))
599 (and (< to end)
600 (alphanumericp (char str to))
601 (alphanumericp (char str (1- to)))))
602 ;; if it's a zero-length string or if words extend beyond FROM
603 ;; or TO we return NIL, i.e. #'IDENTITY
604 nil
605 ;; otherwise we loop through STR from FROM to TO
606 (loop with last-char-both-case
607 with current-result
608 for index of-type fixnum from from below to
609 for chr = (char str index)
610 do (cond ((not #-:cormanlisp (both-case-p chr)
611 #+:cormanlisp (or (upper-case-p chr)
612 (lower-case-p chr)))
613 ;; this character doesn't have a case so we
614 ;; consider it as a word boundary (note that
615 ;; this differs from how \b works in Perl)
616 (setq last-char-both-case nil))
617 ((upper-case-p chr)
618 ;; an uppercase character
619 (setq current-result
620 (if last-char-both-case
621 ;; not the first character in a
622 (case current-result
623 ((:undecided) :upcase)
624 ((:downcase :capitalize) (return nil))
625 ((:upcase) current-result))
626 (case current-result
627 ((nil) :undecided)
628 ((:downcase) (return nil))
629 ((:capitalize :upcase) current-result)))
630 last-char-both-case t))
631 (t
632 ;; a lowercase character
633 (setq current-result
634 (case current-result
635 ((nil) :downcase)
636 ((:undecided) :capitalize)
637 ((:downcase) current-result)
638 ((:capitalize) (if last-char-both-case
639 current-result
640 (return nil)))
641 ((:upcase) (return nil)))
642 last-char-both-case t)))
643 finally (return current-result)))
644 ((nil) #'identity)
645 ((:undecided :upcase) #'string-upcase)
646 ((:downcase) #'string-downcase)
647 ((:capitalize) #'string-capitalize)))
648
649 ;; first create a scanner to identify the special parts of the
650 ;; replacement string (eat your own dog food...)
651 #-:cormanlisp
652 (let* ((*use-bmh-matchers* nil)
653 (reg-scanner (create-scanner "\\\\(?:\\\\|{\\d+}|\\d+|&|`|')")))
654 (defmethod build-replacement-template ((replacement-string string))
655 (declare (optimize speed
656 (safety 0)
657 (space 0)
658 (debug 0)
659 (compilation-speed 0)
660 #+:lispworks (hcl:fixnum-safety 0)))
661 "Converts a replacement string for REGEX-REPLACE or
662 REGEX-REPLACE-ALL into a replacement template which is an
663 S-expression."
664 (let ((from 0)
665 ;; COLLECTOR will hold the (reversed) template
666 (collector '()))
667 ;; scan through all special parts of the replacement string
668 (do-matches (match-start match-end reg-scanner replacement-string)
669 (when (< from match-start)
670 ;; strings between matches are copied verbatim
671 (push (subseq replacement-string from match-start) collector))
672 ;; PARSE-START is true if the pattern matched a number which
673 ;; refers to a register
674 (let* ((parse-start (position-if #'digit-char-p
675 replacement-string
676 :start match-start
677 :end match-end))
678 (token (if parse-start
679 (1- (parse-integer replacement-string
680 :start parse-start
681 :junk-allowed t))
682 ;; if we didn't match a number we convert the
683 ;; character to a symbol
684 (case (char replacement-string (1+ match-start))
685 ((#\&) :match)
686 ((#\`) :before-match)
687 ((#\') :after-match)
688 ((#\\) :backslash)))))
689 (when (and (numberp token) (< token 0))
690 ;; make sure we don't accept something like "\\0"
691 (signal-ppcre-invocation-error
692 "Illegal substring ~S in replacement string"
693 (subseq replacement-string match-start match-end)))
694 (push token collector))
695 ;; remember where the match ended
696 (setq from match-end))
697 (when (< from (length replacement-string))
698 ;; push the rest of the replacement string onto the list
699 (push (subseq replacement-string from) collector))
700 (nreverse collector))))
701
702 #-:cormanlisp
703 (defmethod build-replacement-template ((replacement-function function))
704 (list replacement-function))
705
706 #-:cormanlisp
707 (defmethod build-replacement-template ((replacement-function-symbol symbol))
708 (list replacement-function-symbol))
709
710 #-:cormanlisp
711 (defmethod build-replacement-template ((replacement-list list))
712 replacement-list)
713
714 ;;; Corman Lisp's methods can't be closures... :(
715 #+:cormanlisp
716 (let* ((*use-bmh-matchers* nil)
717 (reg-scanner (create-scanner "\\\\(?:\\\\|{\\d+}|\\d+|&|`|')")))
718 (defun build-replacement-template (replacement)
719 (declare (optimize speed
720 (safety 0)
721 (space 0)
722 (debug 0)
723 (compilation-speed 0)
724 #+:lispworks (hcl:fixnum-safety 0)))
725 (typecase replacement
726 (string
727 (let ((from 0)
728 ;; COLLECTOR will hold the (reversed) template
729 (collector '()))
730 ;; scan through all special parts of the replacement string
731 (do-matches (match-start match-end reg-scanner replacement)
732 (when (< from match-start)
733 ;; strings between matches are copied verbatim
734 (push (subseq replacement from match-start) collector))
735 ;; PARSE-START is true if the pattern matched a number which
736 ;; refers to a register
737 (let* ((parse-start (position-if #'digit-char-p
738 replacement
739 :start match-start
740 :end match-end))
741 (token (if parse-start
742 (1- (parse-integer replacement
743 :start parse-start
744 :junk-allowed t))
745 ;; if we didn't match a number we convert the
746 ;; character to a symbol
747 (case (char replacement (1+ match-start))
748 ((#\&) :match)
749 ((#\`) :before-match)
750 ((#\') :after-match)
751 ((#\\) :backslash)))))
752 (when (and (numberp token) (< token 0))
753 ;; make sure we don't accept something like "\\0"
754 (signal-ppcre-invocation-error
755 "Illegal substring ~S in replacement string"
756 (subseq replacement match-start match-end)))
757 (push token collector))
758 ;; remember where the match ended
759 (setq from match-end))
760 (when (< from (length replacement))
761 ;; push the rest of the replacement string onto the list
762 (push (nsubseq replacement from) collector))
763 (nreverse collector)))
764 (list
765 replacement)
766 (t
767 (list replacement)))))
768
769 (defun build-replacement (replacement-template
770 target-string
771 start end
772 match-start match-end
773 reg-starts reg-ends)
774 (declare (optimize speed
775 (safety 0)
776 (space 0)
777 (debug 0)
778 (compilation-speed 0)
779 #+:lispworks (hcl:fixnum-safety 0)))
780 "Accepts a replacement template and the current values from the
781 matching process in REGEX-REPLACE or REGEX-REPLACE-ALL and returns the
782 corresponding template."
783 ;; the upper exclusive bound of the register numbers in the regular
784 ;; expression
785 (let ((reg-bound (if reg-starts
786 (array-dimension reg-starts 0)
787 0)))
788 (with-output-to-string (s)
789 (loop for token in replacement-template
790 do (typecase token
791 (string
792 ;; transfer string parts verbatim
793 (write-string token s))
794 (integer
795 ;; replace numbers with the corresponding registers
796 (when (>= token reg-bound)
797 ;; but only if the register was referenced in the
798 ;; regular expression
799 (signal-ppcre-invocation-error
800 "Reference to non-existent register ~A in replacement string"
801 (1+ token)))
802 (when (svref reg-starts token)
803 ;; and only if it matched, i.e. no match results
804 ;; in an empty string
805 (write-string target-string s
806 :start (svref reg-starts token)
807 :end (svref reg-ends token))))
808 (function
809 (write-string (funcall token
810 target-string
811 start end
812 match-start match-end
813 reg-starts reg-ends)
814 s))
815 (symbol
816 (case token
817 ((:backslash)
818 ;; just a backslash
819 (write-char #\\ s))
820 ((:match)
821 ;; the whole match
822 (write-string target-string s
823 :start match-start
824 :end match-end))
825 ((:before-match)
826 ;; the part of the target string before the match
827 (write-string target-string s
828 :start start
829 :end match-start))
830 ((:after-match)
831 ;; the part of the target string after the match
832 (write-string target-string s
833 :start match-end
834 :end end))
835 (otherwise
836 (write-string (funcall token
837 target-string
838 start end
839 match-start match-end
840 reg-starts reg-ends)
841 s)))))))))
842
843 (defun replace-aux (target-string replacement pos-list reg-list start end preserve-case)
844 (declare (optimize speed
845 (safety 0)
846 (space 0)
847 (debug 0)
848 (compilation-speed 0)
849 #+:lispworks (hcl:fixnum-safety 0)))
850 "Auxiliary function used by REGEX-REPLACE and
851 REGEX-REPLACE-ALL. POS-LIST contains a list with the start and end
852 positions of all matches while REG-LIST contains a list of arrays
853 representing the corresponding register start and end positions."
854 ;; build the template once before we start the loop
855 (let ((replacement-template (build-replacement-template replacement)))
856 (with-output-to-string (s)
857 ;; loop through all matches and take the start and end of the
858 ;; whole string into account
859 (loop for (from to) on (append (list start) pos-list (list end))
860 ;; alternate between replacement and no replacement
861 for replace = nil then (and (not replace) to)
862 for reg-starts = (if replace (pop reg-list) nil)
863 for reg-ends = (if replace (pop reg-list) nil)
864 for curr-replacement = (if replace
865 ;; build the replacement string
866 (build-replacement replacement-template
867 target-string
868 start end
869 from to
870 reg-starts reg-ends)
871 nil)
872 while to
873 if replace
874 do (write-string (if preserve-case
875 ;; modify the case of the replacement
876 ;; string if necessary
877 (funcall (string-case-modifier target-string
878 from to
879 start end)
880 curr-replacement)
881 curr-replacement)
882 s)
883 else
884 ;; no replacement
885 do (write-string target-string s :start from :end to)))))
886
887 (defun regex-replace (regex target-string replacement
888 &key (start 0)
889 (end (length target-string))
890 preserve-case)
891 (declare (optimize speed
892 (safety 0)
893 (space 0)
894 (debug 0)
895 (compilation-speed 0)
896 #+:lispworks (hcl:fixnum-safety 0)))
897 "Try to match TARGET-STRING between START and END against REGEX and
898 replace the first match with REPLACEMENT.
899
900 REPLACEMENT can be a string which may contain the special substrings
901 \"\\&\" for the whole match, \"\\`\" for the part of TARGET-STRING
902 before the match, \"\\'\" for the part of TARGET-STRING after the
903 match, \"\\N\" or \"\\{N}\" for the Nth register where N is a positive
904 integer.
905
906 REPLACEMENT can also be a function designator in which case the
907 match will be replaced with the result of calling the function
908 designated by REPLACEMENT with the arguments TARGET-STRING, START,
909 END, MATCH-START, MATCH-END, REG-STARTS, and REG-ENDS. (REG-STARTS and
910 REG-ENDS are arrays holding the start and end positions of matched
911 registers or NIL - the meaning of the other arguments should be
912 obvious.)
913
914 Finally, REPLACEMENT can be a list where each element is a string,
915 one of the symbols :MATCH, :BEFORE-MATCH, or :AFTER-MATCH -
916 corresponding to \"\\&\", \"\\`\", and \"\\'\" above -, an integer N -
917 representing register (1+ N) -, or a function designator.
918
919 If PRESERVE-CASE is true, the replacement will try to preserve the
920 case (all upper case, all lower case, or capitalized) of the
921 match. The result will always be a fresh string, even if REGEX doesn't
922 match."
923 (multiple-value-bind (match-start match-end reg-starts reg-ends)
924 (scan regex target-string :start start :end end)
925 (if match-start
926 (replace-aux target-string replacement
927 (list match-start match-end)
928 (list reg-starts reg-ends)
929 start end preserve-case)
930 (subseq target-string start end))))
931
932 (defun regex-replace-all (regex target-string replacement
933 &key (start 0)
934 (end (length target-string))
935 preserve-case)
936 (declare (optimize speed
937 (safety 0)
938 (space 0)
939 (debug 0)
940 (compilation-speed 0)
941 #+:lispworks (hcl:fixnum-safety 0)))
942 "Try to match TARGET-STRING between START and END against REGEX and
943 replace all matches with REPLACEMENT.
944
945 REPLACEMENT can be a string which may contain the special substrings
946 \"\\&\" for the whole match, \"\\`\" for the part of TARGET-STRING
947 before the match, \"\\'\" for the part of TARGET-STRING after the
948 match, \"\\N\" or \"\\{N}\" for the Nth register where N is a positive
949 integer.
950
951 REPLACEMENT can also be a function designator in which case the
952 match will be replaced with the result of calling the function
953 designated by REPLACEMENT with the arguments TARGET-STRING, START,
954 END, MATCH-START, MATCH-END, REG-STARTS, and REG-ENDS. (REG-STARTS and
955 REG-ENDS are arrays holding the start and end positions of matched
956 registers or NIL - the meaning of the other arguments should be
957 obvious.)
958
959 Finally, REPLACEMENT can be a list where each element is a string,
960 one of the symbols :MATCH, :BEFORE-MATCH, or :AFTER-MATCH -
961 corresponding to \"\\&\", \"\\`\", and \"\\'\" above -, an integer N -
962 representing register (1+ N) -, or a function designator.
963
964 If PRESERVE-CASE is true, the replacement will try to preserve the
965 case (all upper case, all lower case, or capitalized) of the
966 match. The result will always be a fresh string, even if REGEX doesn't
967 match."
968 (let ((pos-list '())
969 (reg-list '()))
970 (do-scans (match-start match-end reg-starts reg-ends regex target-string
971 nil
972 :start start :end end)
973 (push match-start pos-list)
974 (push match-end pos-list)
975 (push reg-starts reg-list)
976 (push reg-ends reg-list))
977 (if pos-list
978 (replace-aux target-string replacement
979 (nreverse pos-list)
980 (nreverse reg-list)
981 start end preserve-case)
982 (subseq target-string start end))))
983
984 #-:cormanlisp
985 (defmacro regex-apropos-aux ((regex packages case-insensitive &optional return-form)
986 &body body)
987 "Auxiliary macro used by REGEX-APROPOS and REGEX-APROPOS-LIST. Loops
988 through PACKAGES and executes BODY with SYMBOL bound to each symbol
989 which matches REGEX. Optionally evaluates and returns RETURN-FORM at
990 the end. If CASE-INSENSITIVE is true and REGEX isn't already a
991 scanner, a case-insensitive scanner is used."
992 (rebinding (regex)
993 (with-unique-names (scanner %packages next morep)
994 `(let* ((,scanner (create-scanner ,regex
995 :case-insensitive-mode
996 (and ,case-insensitive
997 (not (functionp ,regex)))))
998 (,%packages (or ,packages
999 (list-all-packages))))
1000 (with-package-iterator (,next ,%packages :external :internal)
1001 (loop
1002 (multiple-value-bind (,morep symbol)
1003 (,next)
1004 (unless ,morep
1005 (return ,return-form))
1006 (when (scan ,scanner (symbol-name symbol))
1007 ,@body))))))))
1008
1009 ;;; The following two functions were provided by Karsten Poeck
1010
1011 #+:cormanlisp
1012 (defmacro do-with-all-symbols ((variable package-packagelist) &body body)
1013 (with-unique-names (pack-var iter-sym)
1014 `(if (listp ,package-packagelist)
1015 (dolist (,pack-var ,package-packagelist)
1016 (do-symbols (,iter-sym ,pack-var)
1017 ,@body))
1018 (do-symbols (,iter-sym ,package-packagelist)
1019 ,@body))))
1020
1021 #+:cormanlisp
1022 (defmacro regex-apropos-aux ((regex packages case-insensitive &optional return-form)
1023 &body body)
1024 "Auxiliary macro used by REGEX-APROPOS and REGEX-APROPOS-LIST. Loops
1025 through PACKAGES and executes BODY with SYMBOL bound to each symbol
1026 which matches REGEX. Optionally evaluates and returns RETURN-FORM at
1027 the end. If CASE-INSENSITIVE is true and REGEX isn't already a
1028 scanner, a case-insensitive scanner is used."
1029 (rebinding (regex)
1030 (with-unique-names (scanner %packages)
1031 `(let* ((,scanner (create-scanner ,regex
1032 :case-insensitive-mode
1033 (and ,case-insensitive
1034 (not (functionp ,regex)))))
1035 (,%packages (or ,packages
1036 (list-all-packages))))
1037 (do-with-all-symbols (symbol ,%packages)
1038 (when (scan ,scanner (symbol-name symbol))
1039 ,@body))
1040 ,return-form))))
1041
1042 (defun regex-apropos-list (regex &optional packages &key (case-insensitive t))
1043 (declare (optimize speed
1044 (safety 0)
1045 (space 0)
1046 (debug 0)
1047 (compilation-speed 0)
1048 #+:lispworks (hcl:fixnum-safety 0)))
1049 "Similar to the standard function APROPOS-LIST but returns a list of
1050 all symbols which match the regular expression REGEX. If
1051 CASE-INSENSITIVE is true and REGEX isn't already a scanner, a
1052 case-insensitive scanner is used."
1053 (let ((collector '()))
1054 (regex-apropos-aux (regex packages case-insensitive collector)
1055 (push symbol collector))))
1056
1057 (defun print-symbol-info (symbol)
1058 "Auxiliary function used by REGEX-APROPOS. Tries to print some
1059 meaningful information about a symbol."
1060 (declare (optimize speed
1061 (safety 0)
1062 (space 0)
1063 (debug 0)
1064 (compilation-speed 0)
1065 #+:lispworks (hcl:fixnum-safety 0)))
1066 (handler-case
1067 (let ((output-list '()))
1068 (cond ((special-operator-p symbol)
1069 (push "[special operator]" output-list))
1070 ((macro-function symbol)
1071 (push "[macro]" output-list))
1072 ((fboundp symbol)
1073 (let* ((function (symbol-function symbol))
1074 (compiledp (compiled-function-p function)))
1075 (multiple-value-bind (lambda-expr closurep)
1076 (function-lambda-expression function)
1077 (push
1078 (format nil
1079 "[~:[~;compiled ~]~:[function~;closure~]]~:[~; ~A~]"
1080 compiledp closurep lambda-expr (cadr lambda-expr))
1081 output-list)))))
1082 (let ((class (find-class symbol nil)))
1083 (when class
1084 (push (format nil "[class] ~S" class) output-list)))
1085 (cond ((keywordp symbol)
1086 (push "[keyword]" output-list))
1087 ((constantp symbol)
1088 (push (format nil "[constant]~:[~; value: ~S~]"
1089 (boundp symbol) (symbol-value symbol)) output-list))
1090 ((boundp symbol)
1091 (push #+(or LispWorks CLISP) "[variable]"
1092 #-(or LispWorks CLISP) (format nil "[variable] value: ~S"
1093 (symbol-value symbol))
1094 output-list)))
1095 #-(or :cormanlisp :clisp)
1096 (format t "~&~S ~<~;~^~A~@{~:@_~A~}~;~:>" symbol output-list)
1097 #+(or :cormanlisp :clisp)
1098 (loop for line in output-list
1099 do (format t "~&~S ~A" symbol line)))
1100 (condition ()
1101 ;; this seems to be necessary due to some errors I encountered
1102 ;; with LispWorks
1103 (format t "~&~S [an error occured while trying to print more info]" symbol))))
1104
1105 (defun regex-apropos (regex &optional packages &key (case-insensitive t))
1106 "Similar to the standard function APROPOS but returns a list of all
1107 symbols which match the regular expression REGEX. If CASE-INSENSITIVE
1108 is true and REGEX isn't already a scanner, a case-insensitive scanner
1109 is used."
1110 (declare (optimize speed
1111 (safety 0)
1112 (space 0)
1113 (debug 0)
1114 (compilation-speed 0)
1115 #+:lispworks (hcl:fixnum-safety 0)))
1116 (regex-apropos-aux (regex packages case-insensitive)
1117 (print-symbol-info symbol))
1118 (values))
1119
1120 (let* ((*use-bmh-matchers* nil)
1121 (non-word-char-scanner (create-scanner "[^a-zA-Z_0-9]")))
1122 (defun quote-meta-chars (string &key (start 0) (end (length string)))
1123 "Quote, i.e. prefix with #\\\\, all non-word characters in STRING."
1124 (regex-replace-all non-word-char-scanner string "\\\\\\&"
1125 :start start :end end)))
1126
1127 (let* ((*use-bmh-matchers* nil)
1128 (*allow-quoting* nil)
1129 (quote-char-scanner (create-scanner "\\\\Q"))
1130 (section-scanner (create-scanner "\\\\Q((?:[^\\\\]|\\\\(?!Q))*?)(?:\\\\E|$)")))
1131 (defun quote-sections (string)
1132 "Replace sections inside of STRING which are enclosed by \\Q and
1133 \\E with the quoted equivalent of these sections \(see
1134 QUOTE-META-CHARS). Repeat this as long as there are such
1135 sections. These sections may nest."
1136 (flet ((quote-substring (target-string start end match-start
1137 match-end reg-starts reg-ends)
1138 (declare (ignore start end match-start match-end))
1139 (quote-meta-chars target-string
1140 :start (svref reg-starts 0)
1141 :end (svref reg-ends 0))))
1142 (loop for result = string then (regex-replace-all section-scanner
1143 result
1144 #'quote-substring)
1145 while (scan quote-char-scanner result)
1146 finally (return result)))))
1147
1148 (let* ((*use-bmh-matchers* nil)
1149 (comment-scanner (create-scanner "(?s)\\(\\?#.*?\\)"))
1150 (extended-comment-scanner (create-scanner "(?m:#.*?$)|(?s:\\(\\?#.*?\\))"))
1151 (quote-token-scanner "\\\\[QE]")
1152 (quote-token-replace-scanner "\\\\([QE])"))
1153 (defun clean-comments (string &optional extended-mode)
1154 "Clean \(?#...) comments within STRING for quoting, i.e. convert
1155 \\Q to Q and \\E to E. If EXTENDED-MODE is true, also clean
1156 end-of-line comments, i.e. those starting with #\\# and ending with
1157 #\\Newline."
1158 (flet ((remove-tokens (target-string start end match-start
1159 match-end reg-starts reg-ends)
1160 (declare (ignore start end reg-starts reg-ends))
1161 (loop for result = (nsubseq target-string match-start match-end)
1162 then (regex-replace-all quote-token-replace-scanner result "\\1")
1163 ;; we must probably repeat this because the comment
1164 ;; can contain substrings like \\Q
1165 while (scan quote-token-scanner result)
1166 finally (return result))))
1167 (regex-replace-all (if extended-mode
1168 extended-comment-scanner
1169 comment-scanner)
1170 string
1171 #'remove-tokens))))

  ViewVC Help
Powered by ViewVC 1.1.5