/[slime]/slime/nregex.lisp
ViewVC logotype

Contents of /slime/nregex.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (show annotations)
Wed Apr 21 18:55:24 2004 UTC (9 years, 11 months ago) by heller
Branch: MAIN
imported
1 ;;;
2 ;;; This code was written by:
3 ;;;
4 ;;; Lawrence E. Freil <lef@nscf.org>
5 ;;; National Science Center Foundation
6 ;;; Augusta, Georgia 30909
7 ;;;
8 ;;; If you modify this code, please comment your modifications
9 ;;; clearly and inform the author of any improvements so they
10 ;;; can be incorporated in future releases.
11 ;;;
12 ;;; nregex.lisp - My 4/8/92 attempt at a Lisp based regular expression
13 ;;; parser.
14 ;;;
15 ;;; This regular expression parser operates by taking a
16 ;;; regular expression and breaking it down into a list
17 ;;; consisting of lisp expressions and flags. The list
18 ;;; of lisp expressions is then taken in turned into a
19 ;;; lambda expression that can be later applied to a
20 ;;; string argument for parsing.
21 ;;;;
22 ;;;; Modifications made 6 March 2001 By Chris Double (chris@double.co.nz)
23 ;;;; to get working with Corman Lisp 1.42, add package statement and export
24 ;;;; relevant functions.
25 ;;;;
26
27 (in-package :cl-user)
28
29 ;;;; CND - 6/3/2001
30 (defpackage nregex
31 (:use #:common-lisp)
32 (:export
33 #:regex
34 #:regex-compile
35 ))
36
37 ;;;; CND - 6/3/2001
38 (in-package :nregex)
39
40 ;;;
41 ;;; First we create a copy of macros to help debug the beast
42 (eval-when (:compile-toplevel :load-toplevel :execute)
43 (defvar *regex-debug* nil) ; Set to nil for no debugging code
44 )
45
46 (defmacro info (message &rest args)
47 (if *regex-debug*
48 `(format *standard-output* ,message ,@args)))
49
50 ;;;
51 ;;; Declare the global variables for storing the paren index list.
52 ;;;
53 (defvar *regex-groups* (make-array 10))
54 (defvar *regex-groupings* 0)
55
56 ;;;
57 ;;; Declare a simple interface for testing. You probably wouldn't want
58 ;;; to use this interface unless you were just calling this once.
59 ;;;
60 (defun regex (expression string)
61 "Usage: (regex <expression> <string)
62 This function will call regex-compile on the expression and then apply
63 the string to the returned lambda list."
64 (let ((findit (cond ((stringp expression)
65 (regex-compile expression))
66 ((listp expression)
67 expression)))
68 (result nil))
69 (if (not (funcall (if (functionp findit)
70 findit
71 (eval `(function ,findit))) string))
72 (return-from regex nil))
73 (if (= *regex-groupings* 0)
74 (return-from regex t))
75 (dotimes (i *regex-groupings*)
76 (push (funcall 'subseq
77 string
78 (car (aref *regex-groups* i))
79 (cadr (aref *regex-groups* i)))
80 result))
81 (reverse result)))
82
83 ;;;
84 ;;; Declare some simple macros to make the code more readable.
85 ;;;
86 (defvar *regex-special-chars* "?*+.()[]\\${}")
87
88 (defmacro add-exp (list)
89 "Add an item to the end of expression"
90 `(setf expression (append expression ,list)))
91
92 ;;;
93 ;;; Define a function that will take a quoted character and return
94 ;;; what the real character should be plus how much of the source
95 ;;; string was used. If the result is a set of characters, return an
96 ;;; array of bits indicating which characters should be set. If the
97 ;;; expression is one of the sub-group matches return a
98 ;;; list-expression that will provide the match.
99 ;;;
100 (defun regex-quoted (char-string &optional (invert nil))
101 "Usage: (regex-quoted <char-string> &optional invert)
102 Returns either the quoted character or a simple bit vector of bits set for
103 the matching values"
104 (let ((first (char char-string 0))
105 (result (char char-string 0))
106 (used-length 1))
107 (cond ((eql first #\n)
108 (setf result #\NewLine))
109 ((eql first #\c)
110 (setf result #\Return))
111 ((eql first #\t)
112 (setf result #\Tab))
113 ((eql first #\d)
114 (setf result #*0000000000000000000000000000000000000000000000001111111111000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
115 ((eql first #\D)
116 (setf result #*1111111111111111111111111111111111111111111111110000000000111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
117 ((eql first #\w)
118 (setf result #*0000000000000000000000000000000000000000000000001111111111000000011111111111111111111111111000010111111111111111111111111110000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
119 ((eql first #\W)
120 (setf result #*1111111111111111111111111111111111111111111111110000000000111111100000000000000000000000000111101000000000000000000000000001111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
121 ((eql first #\b)
122 (setf result #*0000000001000000000000000000000011000000000010100000000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
123 ((eql first #\B)
124 (setf result #*1111111110111111111111111111111100111111111101011111111111011111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
125 ((eql first #\s)
126 (setf result #*0000000001100000000000000000000010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
127 ((eql first #\S)
128 (setf result #*1111111110011111111111111111111101111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
129 ((and (>= (char-code first) (char-code #\0))
130 (<= (char-code first) (char-code #\9)))
131 (if (and (> (length char-string) 2)
132 (and (>= (char-code (char char-string 1)) (char-code #\0))
133 (<= (char-code (char char-string 1)) (char-code #\9))
134 (>= (char-code (char char-string 2)) (char-code #\0))
135 (<= (char-code (char char-string 2)) (char-code #\9))))
136 ;;
137 ;; It is a single character specified in octal
138 ;;
139 (progn
140 (setf result (do ((x 0 (1+ x))
141 (return 0))
142 ((= x 2) return)
143 (setf return (+ (* return 8)
144 (- (char-code (char char-string x))
145 (char-code #\0))))))
146 (setf used-length 3))
147 ;;
148 ;; We have a group number replacement.
149 ;;
150 (let ((group (- (char-code first) (char-code #\0))))
151 (setf result `((let ((nstring (subseq string (car (aref *regex-groups* ,group))
152 (cadr (aref *regex-groups* ,group)))))
153 (if (< length (+ index (length nstring)))
154 (return-from compare nil))
155 (if (not (string= string nstring
156 :start1 index
157 :end1 (+ index (length nstring))))
158 (return-from compare nil)
159 (incf index (length nstring)))))))))
160 (t
161 (setf result first)))
162 (if (and (vectorp result) invert)
163 (bit-xor result #*1111111110011111111111111111111101111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 t))
164 (values result used-length)))
165
166 ;;;
167 ;;; Now for the main regex compiler routine.
168 ;;;
169 (defun regex-compile (source &key (anchored nil))
170 "Usage: (regex-compile <expression> [ :anchored (t/nil) ])
171 This function take a regular expression (supplied as source) and
172 compiles this into a lambda list that a string argument can then
173 be applied to. It is also possible to compile this lambda list
174 for better performance or to save it as a named function for later
175 use"
176 (info "Now entering regex-compile with \"~A\"~%" source)
177 ;;
178 ;; This routine works in two parts.
179 ;; The first pass take the regular expression and produces a list of
180 ;; operators and lisp expressions for the entire regular expression.
181 ;; The second pass takes this list and produces the lambda expression.
182 (let ((expression '()) ; holder for expressions
183 (group 1) ; Current group index
184 (group-stack nil) ; Stack of current group endings
185 (result nil) ; holder for built expression.
186 (fast-first nil)) ; holder for quick unanchored scan
187 ;;
188 ;; If the expression was an empty string then it alway
189 ;; matches (so lets leave early)
190 ;;
191 (if (= (length source) 0)
192 (return-from regex-compile
193 '(lambda (&rest args)
194 (declare (ignore args))
195 t)))
196 ;;
197 ;; If the first character is a caret then set the anchored
198 ;; flags and remove if from the expression string.
199 ;;
200 (cond ((eql (char source 0) #\^)
201 (setf source (subseq source 1))
202 (setf anchored t)))
203 ;;
204 ;; If the first sequence is .* then also set the anchored flags.
205 ;; (This is purely for optimization, it will work without this).
206 ;;
207 (if (>= (length source) 2)
208 (if (string= source ".*" :start1 0 :end1 2)
209 (setf anchored t)))
210 ;;
211 ;; Also, If this is not an anchored search and the first character is
212 ;; a literal, then do a quick scan to see if it is even in the string.
213 ;; If not then we can issue a quick nil,
214 ;; otherwise we can start the search at the matching character to skip
215 ;; the checks of the non-matching characters anyway.
216 ;;
217 ;; If I really wanted to speed up this section of code it would be
218 ;; easy to recognize the case of a fairly long multi-character literal
219 ;; and generate a Boyer-Moore search for the entire literal.
220 ;;
221 ;; I generate the code to do a loop because on CMU Lisp this is about
222 ;; twice as fast a calling position.
223 ;;
224 (if (and (not anchored)
225 (not (position (char source 0) *regex-special-chars*))
226 (not (and (> (length source) 1)
227 (position (char source 1) *regex-special-chars*))))
228 (setf fast-first `((if (not (dotimes (i length nil)
229 (if (eql (char string i)
230 ,(char source 0))
231 (return (setf start i)))))
232 (return-from final-return nil)))))
233 ;;
234 ;; Generate the very first expression to save the starting index
235 ;; so that group 0 will be the entire string matched always
236 ;;
237 (add-exp '((setf (aref *regex-groups* 0)
238 (list index nil))))
239 ;;
240 ;; Loop over each character in the regular expression building the
241 ;; expression list as we go.
242 ;;
243 (do ((eindex 0 (1+ eindex)))
244 ((= eindex (length source)))
245 (let ((current (char source eindex)))
246 (info "Now processing character ~A index = ~A~%" current eindex)
247 (case current
248 ((#\.)
249 ;;
250 ;; Generate code for a single wild character
251 ;;
252 (add-exp '((if (>= index length)
253 (return-from compare nil)
254 (incf index)))))
255 ((#\$)
256 ;;
257 ;; If this is the last character of the expression then
258 ;; anchor the end of the expression, otherwise let it slide
259 u ;; as a standard character (even though it should be quoted).
260 ;;
261 (if (= eindex (1- (length source)))
262 (add-exp '((if (not (= index length))
263 (return-from compare nil))))
264 (add-exp '((if (not (and (< index length)
265 (eql (char string index) #\$)))
266 (return-from compare nil)
267 (incf index))))))
268 ((#\*)
269 (add-exp '(ASTRISK)))
270
271 ((#\+)
272 (add-exp '(PLUS)))
273
274 ((#\?)
275 (add-exp '(QUESTION)))
276
277 ((#\()
278 ;;
279 ;; Start a grouping.
280 ;;
281 (incf group)
282 (push group group-stack)
283 (add-exp `((setf (aref *regex-groups* ,(1- group))
284 (list index nil))))
285 (add-exp `(,group)))
286 ((#\))
287 ;;
288 ;; End a grouping
289 ;;
290 (let ((group (pop group-stack)))
291 (add-exp `((setf (cadr (aref *regex-groups* ,(1- group)))
292 index)))
293 (add-exp `(,(- group)))))
294 ((#\[)
295 ;;
296 ;; Start of a range operation.
297 ;; Generate a bit-vector that has one bit per possible character
298 ;; and then on each character or range, set the possible bits.
299 ;;
300 ;; If the first character is carat then invert the set.
301 (let* ((invert (eql (char source (1+ eindex)) #\^))
302 (bitstring (make-array 256 :element-type 'bit
303 :initial-element
304 (if invert 1 0)))
305 (set-char (if invert 0 1)))
306 (if invert (incf eindex))
307 (do ((x (1+ eindex) (1+ x)))
308 ((eql (char source x) #\]) (setf eindex x))
309 (info "Building range with character ~A~%" (char source x))
310 (cond ((and (eql (char source (1+ x)) #\-)
311 (not (eql (char source (+ x 2)) #\])))
312 (if (>= (char-code (char source x))
313 (char-code (char source (+ 2 x))))
314 (error "Invalid range \"~A-~A\". Ranges must be in acending order"
315 (char source x) (char source (+ 2 x))))
316 (do ((j (char-code (char source x)) (1+ j)))
317 ((> j (char-code (char source (+ 2 x))))
318 (incf x 2))
319 (info "Setting bit for char ~A code ~A~%" (code-char j) j)
320 (setf (sbit bitstring j) set-char)))
321 (t
322 (cond ((not (eql (char source x) #\]))
323 (let ((char (char source x)))
324 ;;
325 ;; If the character is quoted then find out what
326 ;; it should have been
327 ;;
328 (if (eql (char source x) #\\ )
329 (let ((length))
330 (multiple-value-setq (char length)
331 (regex-quoted (subseq source x) invert))
332 (incf x length)))
333 (info "Setting bit for char ~A code ~A~%" char (char-code char))
334 (if (not (vectorp char))
335 (setf (sbit bitstring (char-code (char source x))) set-char)
336 (bit-ior bitstring char t))))))))
337 (add-exp `((let ((range ,bitstring))
338 (if (>= index length)
339 (return-from compare nil))
340 (if (= 1 (sbit range (char-code (char string index))))
341 (incf index)
342 (return-from compare nil)))))))
343 ((#\\ )
344 ;;
345 ;; Intreprete the next character as a special, range, octal, group or
346 ;; just the character itself.
347 ;;
348 (let ((length)
349 (value))
350 (multiple-value-setq (value length)
351 (regex-quoted (subseq source (1+ eindex)) nil))
352 (cond ((listp value)
353 (add-exp value))
354 ((characterp value)
355 (add-exp `((if (not (and (< index length)
356 (eql (char string index)
357 ,value)))
358 (return-from compare nil)
359 (incf index)))))
360 ((vectorp value)
361 (add-exp `((let ((range ,value))
362 (if (>= index length)
363 (return-from compare nil))
364 (if (= 1 (sbit range (char-code (char string index))))
365 (incf index)
366 (return-from compare nil)))))))
367 (incf eindex length)))
368 (t
369 ;;
370 ;; We have a literal character.
371 ;; Scan to see how many we have and if it is more than one
372 ;; generate a string= verses as single eql.
373 ;;
374 (let* ((lit "")
375 (term (dotimes (litindex (- (length source) eindex) nil)
376 (let ((litchar (char source (+ eindex litindex))))
377 (if (position litchar *regex-special-chars*)
378 (return litchar)
379 (progn
380 (info "Now adding ~A index ~A to lit~%" litchar
381 litindex)
382 (setf lit (concatenate 'string lit
383 (string litchar)))))))))
384 (if (= (length lit) 1)
385 (add-exp `((if (not (and (< index length)
386 (eql (char string index) ,current)))
387 (return-from compare nil)
388 (incf index))))
389 ;;
390 ;; If we have a multi-character literal then we must
391 ;; check to see if the next character (if there is one)
392 ;; is an astrisk or a plus or a question mark. If so then we must not use this
393 ;; character in the big literal.
394 (progn
395 (if (or (eql term #\*)
396 (eql term #\+)
397 (eql term #\?))
398 (setf lit (subseq lit 0 (1- (length lit)))))
399 (add-exp `((if (< length (+ index ,(length lit)))
400 (return-from compare nil))
401 (if (not (string= string ,lit :start1 index
402 :end1 (+ index ,(length lit))))
403 (return-from compare nil)
404 (incf index ,(length lit)))))))
405 (incf eindex (1- (length lit))))))))
406 ;;
407 ;; Plug end of list to return t. If we made it this far then
408 ;; We have matched!
409 (add-exp '((setf (cadr (aref *regex-groups* 0))
410 index)))
411 (add-exp '((return-from final-return t)))
412 ;;
413 ;;; (print expression)
414 ;;
415 ;; Now take the expression list and turn it into a lambda expression
416 ;; replacing the special flags with lisp code.
417 ;; For example: A BEGIN needs to be replace by an expression that
418 ;; saves the current index, then evaluates everything till it gets to
419 ;; the END then save the new index if it didn't fail.
420 ;; On an ASTRISK I need to take the previous expression and wrap
421 ;; it in a do that will evaluate the expression till an error
422 ;; occurs and then another do that encompases the remainder of the
423 ;; regular expression and iterates decrementing the index by one
424 ;; of the matched expression sizes and then returns nil. After
425 ;; the last expression insert a form that does a return t so that
426 ;; if the entire nested sub-expression succeeds then the loop
427 ;; is broken manually.
428 ;;
429 (setf result (copy-tree nil))
430 ;;
431 ;; Reversing the current expression makes building up the
432 ;; lambda list easier due to the nexting of expressions when
433 ;; and astrisk has been encountered.
434 (setf expression (reverse expression))
435 (do ((elt 0 (1+ elt)))
436 ((>= elt (length expression)))
437 (let ((piece (nth elt expression)))
438 ;;
439 ;; Now check for PLUS, if so then ditto the expression and then let the
440 ;; ASTRISK below handle the rest.
441 ;;
442 (cond ((eql piece 'PLUS)
443 (cond ((listp (nth (1+ elt) expression))
444 (setf result (append (list (nth (1+ elt) expression))
445 result)))
446 ;;
447 ;; duplicate the entire group
448 ;; NOTE: This hasn't been implemented yet!!
449 (t
450 (error "GROUP repeat hasn't been implemented yet~%")))))
451 (cond ((listp piece) ;Just append the list
452 (setf result (append (list piece) result)))
453 ((eql piece 'QUESTION) ; Wrap it in a block that won't fail
454 (cond ((listp (nth (1+ elt) expression))
455 (setf result
456 (append `((progn (block compare
457 ,(nth (1+ elt)
458 expression))
459 t))
460 result))
461 (incf elt))
462 ;;
463 ;; This is a QUESTION on an entire group which
464 ;; hasn't been implemented yet!!!
465 ;;
466 (t
467 (error "Optional groups not implemented yet~%"))))
468 ((or (eql piece 'ASTRISK) ; Do the wild thing!
469 (eql piece 'PLUS))
470 (cond ((listp (nth (1+ elt) expression))
471 ;;
472 ;; This is a single character wild card so
473 ;; do the simple form.
474 ;;
475 (setf result
476 `((let ((oindex index))
477 (block compare
478 (do ()
479 (nil)
480 ,(nth (1+ elt) expression)))
481 (do ((start index (1- start)))
482 ((< start oindex) nil)
483 (let ((index start))
484 (block compare
485 ,@result))))))
486 (incf elt))
487 (t
488 ;;
489 ;; This is a subgroup repeated so I must build
490 ;; the loop using several values.
491 ;;
492 ))
493 )
494 (t t)))) ; Just ignore everything else.
495 ;;
496 ;; Now wrap the result in a lambda list that can then be
497 ;; invoked or compiled, however the user wishes.
498 ;;
499 (if anchored
500 (setf result
501 `(lambda (string &key (start 0) (end (length string)))
502 (setf *regex-groupings* ,group)
503 (block final-return
504 (block compare
505 (let ((index start)
506 (length end))
507 ,@result)))))
508 (setf result
509 `(lambda (string &key (start 0) (end (length string)))
510 (setf *regex-groupings* ,group)
511 (block final-return
512 (let ((length end))
513 ,@fast-first
514 (do ((marker start (1+ marker)))
515 ((> marker end) nil)
516 (let ((index marker))
517 (if (block compare
518 ,@result)
519 (return t)))))))))))
520
521 ;; (provide 'nregex)

  ViewVC Help
Powered by ViewVC 1.1.5