/[slime]/slime/nregex.lisp
ViewVC logotype

Contents of /slime/nregex.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (show annotations)
Fri Jun 10 17:54:00 2005 UTC (8 years, 10 months ago) by heller
Branch: MAIN
Changes since 1.2: +5 -2 lines
Rename package to avoid name clashes with other version of this file.
1 ;;;
2 ;;; This code was written by:
3 ;;;
4 ;;; Lawrence E. Freil <lef@nscf.org>
5 ;;; National Science Center Foundation
6 ;;; Augusta, Georgia 30909
7 ;;;
8 ;;; If you modify this code, please comment your modifications
9 ;;; clearly and inform the author of any improvements so they
10 ;;; can be incorporated in future releases.
11 ;;;
12 ;;; nregex.lisp - My 4/8/92 attempt at a Lisp based regular expression
13 ;;; parser.
14 ;;;
15 ;;; This regular expression parser operates by taking a
16 ;;; regular expression and breaking it down into a list
17 ;;; consisting of lisp expressions and flags. The list
18 ;;; of lisp expressions is then taken in turned into a
19 ;;; lambda expression that can be later applied to a
20 ;;; string argument for parsing.
21 ;;;;
22 ;;;; Modifications made 6 March 2001 By Chris Double (chris@double.co.nz)
23 ;;;; to get working with Corman Lisp 1.42, add package statement and export
24 ;;;; relevant functions.
25 ;;;;
26
27 (in-package :cl-user)
28
29 ;; Renamed to slime-nregex avoid name clashes with other versions of
30 ;; this file. -- he
31
32 ;;;; CND - 6/3/2001
33 (defpackage slime-nregex
34 (:use #:common-lisp)
35 (:export
36 #:regex
37 #:regex-compile
38 ))
39
40 ;;;; CND - 6/3/2001
41 (in-package :slime-nregex)
42
43 ;;;
44 ;;; First we create a copy of macros to help debug the beast
45 (eval-when (:compile-toplevel :load-toplevel :execute)
46 (defvar *regex-debug* nil) ; Set to nil for no debugging code
47 )
48
49 (defmacro info (message &rest args)
50 (if *regex-debug*
51 `(format *standard-output* ,message ,@args)))
52
53 ;;;
54 ;;; Declare the global variables for storing the paren index list.
55 ;;;
56 (defvar *regex-groups* (make-array 10))
57 (defvar *regex-groupings* 0)
58
59 ;;;
60 ;;; Declare a simple interface for testing. You probably wouldn't want
61 ;;; to use this interface unless you were just calling this once.
62 ;;;
63 (defun regex (expression string)
64 "Usage: (regex <expression> <string)
65 This function will call regex-compile on the expression and then apply
66 the string to the returned lambda list."
67 (let ((findit (cond ((stringp expression)
68 (regex-compile expression))
69 ((listp expression)
70 expression)))
71 (result nil))
72 (if (not (funcall (if (functionp findit)
73 findit
74 (eval `(function ,findit))) string))
75 (return-from regex nil))
76 (if (= *regex-groupings* 0)
77 (return-from regex t))
78 (dotimes (i *regex-groupings*)
79 (push (funcall 'subseq
80 string
81 (car (aref *regex-groups* i))
82 (cadr (aref *regex-groups* i)))
83 result))
84 (reverse result)))
85
86 ;;;
87 ;;; Declare some simple macros to make the code more readable.
88 ;;;
89 (defvar *regex-special-chars* "?*+.()[]\\${}")
90
91 (defmacro add-exp (list)
92 "Add an item to the end of expression"
93 `(setf expression (append expression ,list)))
94
95 ;;;
96 ;;; Define a function that will take a quoted character and return
97 ;;; what the real character should be plus how much of the source
98 ;;; string was used. If the result is a set of characters, return an
99 ;;; array of bits indicating which characters should be set. If the
100 ;;; expression is one of the sub-group matches return a
101 ;;; list-expression that will provide the match.
102 ;;;
103 (defun regex-quoted (char-string &optional (invert nil))
104 "Usage: (regex-quoted <char-string> &optional invert)
105 Returns either the quoted character or a simple bit vector of bits set for
106 the matching values"
107 (let ((first (char char-string 0))
108 (result (char char-string 0))
109 (used-length 1))
110 (cond ((eql first #\n)
111 (setf result #\NewLine))
112 ((eql first #\c)
113 (setf result #\Return))
114 ((eql first #\t)
115 (setf result #\Tab))
116 ((eql first #\d)
117 (setf result #*0000000000000000000000000000000000000000000000001111111111000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
118 ((eql first #\D)
119 (setf result #*1111111111111111111111111111111111111111111111110000000000111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
120 ((eql first #\w)
121 (setf result #*0000000000000000000000000000000000000000000000001111111111000000011111111111111111111111111000010111111111111111111111111110000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
122 ((eql first #\W)
123 (setf result #*1111111111111111111111111111111111111111111111110000000000111111100000000000000000000000000111101000000000000000000000000001111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
124 ((eql first #\b)
125 (setf result #*0000000001000000000000000000000011000000000010100000000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
126 ((eql first #\B)
127 (setf result #*1111111110111111111111111111111100111111111101011111111111011111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
128 ((eql first #\s)
129 (setf result #*0000000001100000000000000000000010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
130 ((eql first #\S)
131 (setf result #*1111111110011111111111111111111101111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
132 ((and (>= (char-code first) (char-code #\0))
133 (<= (char-code first) (char-code #\9)))
134 (if (and (> (length char-string) 2)
135 (and (>= (char-code (char char-string 1)) (char-code #\0))
136 (<= (char-code (char char-string 1)) (char-code #\9))
137 (>= (char-code (char char-string 2)) (char-code #\0))
138 (<= (char-code (char char-string 2)) (char-code #\9))))
139 ;;
140 ;; It is a single character specified in octal
141 ;;
142 (progn
143 (setf result (do ((x 0 (1+ x))
144 (return 0))
145 ((= x 2) return)
146 (setf return (+ (* return 8)
147 (- (char-code (char char-string x))
148 (char-code #\0))))))
149 (setf used-length 3))
150 ;;
151 ;; We have a group number replacement.
152 ;;
153 (let ((group (- (char-code first) (char-code #\0))))
154 (setf result `((let ((nstring (subseq string (car (aref *regex-groups* ,group))
155 (cadr (aref *regex-groups* ,group)))))
156 (if (< length (+ index (length nstring)))
157 (return-from compare nil))
158 (if (not (string= string nstring
159 :start1 index
160 :end1 (+ index (length nstring))))
161 (return-from compare nil)
162 (incf index (length nstring)))))))))
163 (t
164 (setf result first)))
165 (if (and (vectorp result) invert)
166 (bit-xor result #*1111111110011111111111111111111101111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 t))
167 (values result used-length)))
168
169 ;;;
170 ;;; Now for the main regex compiler routine.
171 ;;;
172 (defun regex-compile (source &key (anchored nil))
173 "Usage: (regex-compile <expression> [ :anchored (t/nil) ])
174 This function take a regular expression (supplied as source) and
175 compiles this into a lambda list that a string argument can then
176 be applied to. It is also possible to compile this lambda list
177 for better performance or to save it as a named function for later
178 use"
179 (info "Now entering regex-compile with \"~A\"~%" source)
180 ;;
181 ;; This routine works in two parts.
182 ;; The first pass take the regular expression and produces a list of
183 ;; operators and lisp expressions for the entire regular expression.
184 ;; The second pass takes this list and produces the lambda expression.
185 (let ((expression '()) ; holder for expressions
186 (group 1) ; Current group index
187 (group-stack nil) ; Stack of current group endings
188 (result nil) ; holder for built expression.
189 (fast-first nil)) ; holder for quick unanchored scan
190 ;;
191 ;; If the expression was an empty string then it alway
192 ;; matches (so lets leave early)
193 ;;
194 (if (= (length source) 0)
195 (return-from regex-compile
196 '(lambda (&rest args)
197 (declare (ignore args))
198 t)))
199 ;;
200 ;; If the first character is a caret then set the anchored
201 ;; flags and remove if from the expression string.
202 ;;
203 (cond ((eql (char source 0) #\^)
204 (setf source (subseq source 1))
205 (setf anchored t)))
206 ;;
207 ;; If the first sequence is .* then also set the anchored flags.
208 ;; (This is purely for optimization, it will work without this).
209 ;;
210 (if (>= (length source) 2)
211 (if (string= source ".*" :start1 0 :end1 2)
212 (setf anchored t)))
213 ;;
214 ;; Also, If this is not an anchored search and the first character is
215 ;; a literal, then do a quick scan to see if it is even in the string.
216 ;; If not then we can issue a quick nil,
217 ;; otherwise we can start the search at the matching character to skip
218 ;; the checks of the non-matching characters anyway.
219 ;;
220 ;; If I really wanted to speed up this section of code it would be
221 ;; easy to recognize the case of a fairly long multi-character literal
222 ;; and generate a Boyer-Moore search for the entire literal.
223 ;;
224 ;; I generate the code to do a loop because on CMU Lisp this is about
225 ;; twice as fast a calling position.
226 ;;
227 (if (and (not anchored)
228 (not (position (char source 0) *regex-special-chars*))
229 (not (and (> (length source) 1)
230 (position (char source 1) *regex-special-chars*))))
231 (setf fast-first `((if (not (dotimes (i length nil)
232 (if (eql (char string i)
233 ,(char source 0))
234 (return (setf start i)))))
235 (return-from final-return nil)))))
236 ;;
237 ;; Generate the very first expression to save the starting index
238 ;; so that group 0 will be the entire string matched always
239 ;;
240 (add-exp '((setf (aref *regex-groups* 0)
241 (list index nil))))
242 ;;
243 ;; Loop over each character in the regular expression building the
244 ;; expression list as we go.
245 ;;
246 (do ((eindex 0 (1+ eindex)))
247 ((= eindex (length source)))
248 (let ((current (char source eindex)))
249 (info "Now processing character ~A index = ~A~%" current eindex)
250 (case current
251 ((#\.)
252 ;;
253 ;; Generate code for a single wild character
254 ;;
255 (add-exp '((if (>= index length)
256 (return-from compare nil)
257 (incf index)))))
258 ((#\$)
259 ;;
260 ;; If this is the last character of the expression then
261 ;; anchor the end of the expression, otherwise let it slide
262 ;; as a standard character (even though it should be quoted).
263 ;;
264 (if (= eindex (1- (length source)))
265 (add-exp '((if (not (= index length))
266 (return-from compare nil))))
267 (add-exp '((if (not (and (< index length)
268 (eql (char string index) #\$)))
269 (return-from compare nil)
270 (incf index))))))
271 ((#\*)
272 (add-exp '(ASTRISK)))
273
274 ((#\+)
275 (add-exp '(PLUS)))
276
277 ((#\?)
278 (add-exp '(QUESTION)))
279
280 ((#\()
281 ;;
282 ;; Start a grouping.
283 ;;
284 (incf group)
285 (push group group-stack)
286 (add-exp `((setf (aref *regex-groups* ,(1- group))
287 (list index nil))))
288 (add-exp `(,group)))
289 ((#\))
290 ;;
291 ;; End a grouping
292 ;;
293 (let ((group (pop group-stack)))
294 (add-exp `((setf (cadr (aref *regex-groups* ,(1- group)))
295 index)))
296 (add-exp `(,(- group)))))
297 ((#\[)
298 ;;
299 ;; Start of a range operation.
300 ;; Generate a bit-vector that has one bit per possible character
301 ;; and then on each character or range, set the possible bits.
302 ;;
303 ;; If the first character is carat then invert the set.
304 (let* ((invert (eql (char source (1+ eindex)) #\^))
305 (bitstring (make-array 256 :element-type 'bit
306 :initial-element
307 (if invert 1 0)))
308 (set-char (if invert 0 1)))
309 (if invert (incf eindex))
310 (do ((x (1+ eindex) (1+ x)))
311 ((eql (char source x) #\]) (setf eindex x))
312 (info "Building range with character ~A~%" (char source x))
313 (cond ((and (eql (char source (1+ x)) #\-)
314 (not (eql (char source (+ x 2)) #\])))
315 (if (>= (char-code (char source x))
316 (char-code (char source (+ 2 x))))
317 (error "Invalid range \"~A-~A\". Ranges must be in acending order"
318 (char source x) (char source (+ 2 x))))
319 (do ((j (char-code (char source x)) (1+ j)))
320 ((> j (char-code (char source (+ 2 x))))
321 (incf x 2))
322 (info "Setting bit for char ~A code ~A~%" (code-char j) j)
323 (setf (sbit bitstring j) set-char)))
324 (t
325 (cond ((not (eql (char source x) #\]))
326 (let ((char (char source x)))
327 ;;
328 ;; If the character is quoted then find out what
329 ;; it should have been
330 ;;
331 (if (eql (char source x) #\\ )
332 (let ((length))
333 (multiple-value-setq (char length)
334 (regex-quoted (subseq source x) invert))
335 (incf x length)))
336 (info "Setting bit for char ~A code ~A~%" char (char-code char))
337 (if (not (vectorp char))
338 (setf (sbit bitstring (char-code (char source x))) set-char)
339 (bit-ior bitstring char t))))))))
340 (add-exp `((let ((range ,bitstring))
341 (if (>= index length)
342 (return-from compare nil))
343 (if (= 1 (sbit range (char-code (char string index))))
344 (incf index)
345 (return-from compare nil)))))))
346 ((#\\ )
347 ;;
348 ;; Intreprete the next character as a special, range, octal, group or
349 ;; just the character itself.
350 ;;
351 (let ((length)
352 (value))
353 (multiple-value-setq (value length)
354 (regex-quoted (subseq source (1+ eindex)) nil))
355 (cond ((listp value)
356 (add-exp value))
357 ((characterp value)
358 (add-exp `((if (not (and (< index length)
359 (eql (char string index)
360 ,value)))
361 (return-from compare nil)
362 (incf index)))))
363 ((vectorp value)
364 (add-exp `((let ((range ,value))
365 (if (>= index length)
366 (return-from compare nil))
367 (if (= 1 (sbit range (char-code (char string index))))
368 (incf index)
369 (return-from compare nil)))))))
370 (incf eindex length)))
371 (t
372 ;;
373 ;; We have a literal character.
374 ;; Scan to see how many we have and if it is more than one
375 ;; generate a string= verses as single eql.
376 ;;
377 (let* ((lit "")
378 (term (dotimes (litindex (- (length source) eindex) nil)
379 (let ((litchar (char source (+ eindex litindex))))
380 (if (position litchar *regex-special-chars*)
381 (return litchar)
382 (progn
383 (info "Now adding ~A index ~A to lit~%" litchar
384 litindex)
385 (setf lit (concatenate 'string lit
386 (string litchar)))))))))
387 (if (= (length lit) 1)
388 (add-exp `((if (not (and (< index length)
389 (eql (char string index) ,current)))
390 (return-from compare nil)
391 (incf index))))
392 ;;
393 ;; If we have a multi-character literal then we must
394 ;; check to see if the next character (if there is one)
395 ;; is an astrisk or a plus or a question mark. If so then we must not use this
396 ;; character in the big literal.
397 (progn
398 (if (or (eql term #\*)
399 (eql term #\+)
400 (eql term #\?))
401 (setf lit (subseq lit 0 (1- (length lit)))))
402 (add-exp `((if (< length (+ index ,(length lit)))
403 (return-from compare nil))
404 (if (not (string= string ,lit :start1 index
405 :end1 (+ index ,(length lit))))
406 (return-from compare nil)
407 (incf index ,(length lit)))))))
408 (incf eindex (1- (length lit))))))))
409 ;;
410 ;; Plug end of list to return t. If we made it this far then
411 ;; We have matched!
412 (add-exp '((setf (cadr (aref *regex-groups* 0))
413 index)))
414 (add-exp '((return-from final-return t)))
415 ;;
416 ;;; (print expression)
417 ;;
418 ;; Now take the expression list and turn it into a lambda expression
419 ;; replacing the special flags with lisp code.
420 ;; For example: A BEGIN needs to be replace by an expression that
421 ;; saves the current index, then evaluates everything till it gets to
422 ;; the END then save the new index if it didn't fail.
423 ;; On an ASTRISK I need to take the previous expression and wrap
424 ;; it in a do that will evaluate the expression till an error
425 ;; occurs and then another do that encompases the remainder of the
426 ;; regular expression and iterates decrementing the index by one
427 ;; of the matched expression sizes and then returns nil. After
428 ;; the last expression insert a form that does a return t so that
429 ;; if the entire nested sub-expression succeeds then the loop
430 ;; is broken manually.
431 ;;
432 (setf result (copy-tree nil))
433 ;;
434 ;; Reversing the current expression makes building up the
435 ;; lambda list easier due to the nexting of expressions when
436 ;; and astrisk has been encountered.
437 (setf expression (reverse expression))
438 (do ((elt 0 (1+ elt)))
439 ((>= elt (length expression)))
440 (let ((piece (nth elt expression)))
441 ;;
442 ;; Now check for PLUS, if so then ditto the expression and then let the
443 ;; ASTRISK below handle the rest.
444 ;;
445 (cond ((eql piece 'PLUS)
446 (cond ((listp (nth (1+ elt) expression))
447 (setf result (append (list (nth (1+ elt) expression))
448 result)))
449 ;;
450 ;; duplicate the entire group
451 ;; NOTE: This hasn't been implemented yet!!
452 (t
453 (error "GROUP repeat hasn't been implemented yet~%")))))
454 (cond ((listp piece) ;Just append the list
455 (setf result (append (list piece) result)))
456 ((eql piece 'QUESTION) ; Wrap it in a block that won't fail
457 (cond ((listp (nth (1+ elt) expression))
458 (setf result
459 (append `((progn (block compare
460 ,(nth (1+ elt)
461 expression))
462 t))
463 result))
464 (incf elt))
465 ;;
466 ;; This is a QUESTION on an entire group which
467 ;; hasn't been implemented yet!!!
468 ;;
469 (t
470 (error "Optional groups not implemented yet~%"))))
471 ((or (eql piece 'ASTRISK) ; Do the wild thing!
472 (eql piece 'PLUS))
473 (cond ((listp (nth (1+ elt) expression))
474 ;;
475 ;; This is a single character wild card so
476 ;; do the simple form.
477 ;;
478 (setf result
479 `((let ((oindex index))
480 (block compare
481 (do ()
482 (nil)
483 ,(nth (1+ elt) expression)))
484 (do ((start index (1- start)))
485 ((< start oindex) nil)
486 (let ((index start))
487 (block compare
488 ,@result))))))
489 (incf elt))
490 (t
491 ;;
492 ;; This is a subgroup repeated so I must build
493 ;; the loop using several values.
494 ;;
495 ))
496 )
497 (t t)))) ; Just ignore everything else.
498 ;;
499 ;; Now wrap the result in a lambda list that can then be
500 ;; invoked or compiled, however the user wishes.
501 ;;
502 (if anchored
503 (setf result
504 `(lambda (string &key (start 0) (end (length string)))
505 (setf *regex-groupings* ,group)
506 (block final-return
507 (block compare
508 (let ((index start)
509 (length end))
510 ,@result)))))
511 (setf result
512 `(lambda (string &key (start 0) (end (length string)))
513 (setf *regex-groupings* ,group)
514 (block final-return
515 (let ((length end))
516 ,@fast-first
517 (do ((marker start (1+ marker)))
518 ((> marker end) nil)
519 (let ((index marker))
520 (if (block compare
521 ,@result)
522 (return t)))))))))))
523
524 ;; (provide 'nregex)

  ViewVC Help
Powered by ViewVC 1.1.5