/[slime]/slime/nregex.lisp
ViewVC logotype

Contents of /slime/nregex.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (show annotations)
Mon Sep 19 08:20:48 2005 UTC (8 years, 7 months ago) by lgorrie
Branch: MAIN
CVS Tags: SLIME-2-3, SLIME-2-2, SLIME-2-1, SLIME-2-0, SLIME-1-3, FAIRLY-STABLE, byte-stream, HEAD
Branch point for: fsm, contrib
Changes since 1.3: +3 -4 lines
Released into the public domain by Lawrence E. Freil.
1 ;;;
2 ;;; This code was written by:
3 ;;;
4 ;;; Lawrence E. Freil <lef@freil.com>
5 ;;; National Science Center Foundation
6 ;;; Augusta, Georgia 30909
7 ;;;
8 ;;; This program was released into the public domain on 2005-08-31.
9 ;;; (See the slime-devel mailing list archive for details.)
10 ;;;
11 ;;; nregex.lisp - My 4/8/92 attempt at a Lisp based regular expression
12 ;;; parser.
13 ;;;
14 ;;; This regular expression parser operates by taking a
15 ;;; regular expression and breaking it down into a list
16 ;;; consisting of lisp expressions and flags. The list
17 ;;; of lisp expressions is then taken in turned into a
18 ;;; lambda expression that can be later applied to a
19 ;;; string argument for parsing.
20 ;;;;
21 ;;;; Modifications made 6 March 2001 By Chris Double (chris@double.co.nz)
22 ;;;; to get working with Corman Lisp 1.42, add package statement and export
23 ;;;; relevant functions.
24 ;;;;
25
26 (in-package :cl-user)
27
28 ;; Renamed to slime-nregex avoid name clashes with other versions of
29 ;; this file. -- he
30
31 ;;;; CND - 6/3/2001
32 (defpackage slime-nregex
33 (:use #:common-lisp)
34 (:export
35 #:regex
36 #:regex-compile
37 ))
38
39 ;;;; CND - 6/3/2001
40 (in-package :slime-nregex)
41
42 ;;;
43 ;;; First we create a copy of macros to help debug the beast
44 (eval-when (:compile-toplevel :load-toplevel :execute)
45 (defvar *regex-debug* nil) ; Set to nil for no debugging code
46 )
47
48 (defmacro info (message &rest args)
49 (if *regex-debug*
50 `(format *standard-output* ,message ,@args)))
51
52 ;;;
53 ;;; Declare the global variables for storing the paren index list.
54 ;;;
55 (defvar *regex-groups* (make-array 10))
56 (defvar *regex-groupings* 0)
57
58 ;;;
59 ;;; Declare a simple interface for testing. You probably wouldn't want
60 ;;; to use this interface unless you were just calling this once.
61 ;;;
62 (defun regex (expression string)
63 "Usage: (regex <expression> <string)
64 This function will call regex-compile on the expression and then apply
65 the string to the returned lambda list."
66 (let ((findit (cond ((stringp expression)
67 (regex-compile expression))
68 ((listp expression)
69 expression)))
70 (result nil))
71 (if (not (funcall (if (functionp findit)
72 findit
73 (eval `(function ,findit))) string))
74 (return-from regex nil))
75 (if (= *regex-groupings* 0)
76 (return-from regex t))
77 (dotimes (i *regex-groupings*)
78 (push (funcall 'subseq
79 string
80 (car (aref *regex-groups* i))
81 (cadr (aref *regex-groups* i)))
82 result))
83 (reverse result)))
84
85 ;;;
86 ;;; Declare some simple macros to make the code more readable.
87 ;;;
88 (defvar *regex-special-chars* "?*+.()[]\\${}")
89
90 (defmacro add-exp (list)
91 "Add an item to the end of expression"
92 `(setf expression (append expression ,list)))
93
94 ;;;
95 ;;; Define a function that will take a quoted character and return
96 ;;; what the real character should be plus how much of the source
97 ;;; string was used. If the result is a set of characters, return an
98 ;;; array of bits indicating which characters should be set. If the
99 ;;; expression is one of the sub-group matches return a
100 ;;; list-expression that will provide the match.
101 ;;;
102 (defun regex-quoted (char-string &optional (invert nil))
103 "Usage: (regex-quoted <char-string> &optional invert)
104 Returns either the quoted character or a simple bit vector of bits set for
105 the matching values"
106 (let ((first (char char-string 0))
107 (result (char char-string 0))
108 (used-length 1))
109 (cond ((eql first #\n)
110 (setf result #\NewLine))
111 ((eql first #\c)
112 (setf result #\Return))
113 ((eql first #\t)
114 (setf result #\Tab))
115 ((eql first #\d)
116 (setf result #*0000000000000000000000000000000000000000000000001111111111000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
117 ((eql first #\D)
118 (setf result #*1111111111111111111111111111111111111111111111110000000000111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
119 ((eql first #\w)
120 (setf result #*0000000000000000000000000000000000000000000000001111111111000000011111111111111111111111111000010111111111111111111111111110000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
121 ((eql first #\W)
122 (setf result #*1111111111111111111111111111111111111111111111110000000000111111100000000000000000000000000111101000000000000000000000000001111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
123 ((eql first #\b)
124 (setf result #*0000000001000000000000000000000011000000000010100000000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
125 ((eql first #\B)
126 (setf result #*1111111110111111111111111111111100111111111101011111111111011111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
127 ((eql first #\s)
128 (setf result #*0000000001100000000000000000000010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
129 ((eql first #\S)
130 (setf result #*1111111110011111111111111111111101111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
131 ((and (>= (char-code first) (char-code #\0))
132 (<= (char-code first) (char-code #\9)))
133 (if (and (> (length char-string) 2)
134 (and (>= (char-code (char char-string 1)) (char-code #\0))
135 (<= (char-code (char char-string 1)) (char-code #\9))
136 (>= (char-code (char char-string 2)) (char-code #\0))
137 (<= (char-code (char char-string 2)) (char-code #\9))))
138 ;;
139 ;; It is a single character specified in octal
140 ;;
141 (progn
142 (setf result (do ((x 0 (1+ x))
143 (return 0))
144 ((= x 2) return)
145 (setf return (+ (* return 8)
146 (- (char-code (char char-string x))
147 (char-code #\0))))))
148 (setf used-length 3))
149 ;;
150 ;; We have a group number replacement.
151 ;;
152 (let ((group (- (char-code first) (char-code #\0))))
153 (setf result `((let ((nstring (subseq string (car (aref *regex-groups* ,group))
154 (cadr (aref *regex-groups* ,group)))))
155 (if (< length (+ index (length nstring)))
156 (return-from compare nil))
157 (if (not (string= string nstring
158 :start1 index
159 :end1 (+ index (length nstring))))
160 (return-from compare nil)
161 (incf index (length nstring)))))))))
162 (t
163 (setf result first)))
164 (if (and (vectorp result) invert)
165 (bit-xor result #*1111111110011111111111111111111101111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 t))
166 (values result used-length)))
167
168 ;;;
169 ;;; Now for the main regex compiler routine.
170 ;;;
171 (defun regex-compile (source &key (anchored nil))
172 "Usage: (regex-compile <expression> [ :anchored (t/nil) ])
173 This function take a regular expression (supplied as source) and
174 compiles this into a lambda list that a string argument can then
175 be applied to. It is also possible to compile this lambda list
176 for better performance or to save it as a named function for later
177 use"
178 (info "Now entering regex-compile with \"~A\"~%" source)
179 ;;
180 ;; This routine works in two parts.
181 ;; The first pass take the regular expression and produces a list of
182 ;; operators and lisp expressions for the entire regular expression.
183 ;; The second pass takes this list and produces the lambda expression.
184 (let ((expression '()) ; holder for expressions
185 (group 1) ; Current group index
186 (group-stack nil) ; Stack of current group endings
187 (result nil) ; holder for built expression.
188 (fast-first nil)) ; holder for quick unanchored scan
189 ;;
190 ;; If the expression was an empty string then it alway
191 ;; matches (so lets leave early)
192 ;;
193 (if (= (length source) 0)
194 (return-from regex-compile
195 '(lambda (&rest args)
196 (declare (ignore args))
197 t)))
198 ;;
199 ;; If the first character is a caret then set the anchored
200 ;; flags and remove if from the expression string.
201 ;;
202 (cond ((eql (char source 0) #\^)
203 (setf source (subseq source 1))
204 (setf anchored t)))
205 ;;
206 ;; If the first sequence is .* then also set the anchored flags.
207 ;; (This is purely for optimization, it will work without this).
208 ;;
209 (if (>= (length source) 2)
210 (if (string= source ".*" :start1 0 :end1 2)
211 (setf anchored t)))
212 ;;
213 ;; Also, If this is not an anchored search and the first character is
214 ;; a literal, then do a quick scan to see if it is even in the string.
215 ;; If not then we can issue a quick nil,
216 ;; otherwise we can start the search at the matching character to skip
217 ;; the checks of the non-matching characters anyway.
218 ;;
219 ;; If I really wanted to speed up this section of code it would be
220 ;; easy to recognize the case of a fairly long multi-character literal
221 ;; and generate a Boyer-Moore search for the entire literal.
222 ;;
223 ;; I generate the code to do a loop because on CMU Lisp this is about
224 ;; twice as fast a calling position.
225 ;;
226 (if (and (not anchored)
227 (not (position (char source 0) *regex-special-chars*))
228 (not (and (> (length source) 1)
229 (position (char source 1) *regex-special-chars*))))
230 (setf fast-first `((if (not (dotimes (i length nil)
231 (if (eql (char string i)
232 ,(char source 0))
233 (return (setf start i)))))
234 (return-from final-return nil)))))
235 ;;
236 ;; Generate the very first expression to save the starting index
237 ;; so that group 0 will be the entire string matched always
238 ;;
239 (add-exp '((setf (aref *regex-groups* 0)
240 (list index nil))))
241 ;;
242 ;; Loop over each character in the regular expression building the
243 ;; expression list as we go.
244 ;;
245 (do ((eindex 0 (1+ eindex)))
246 ((= eindex (length source)))
247 (let ((current (char source eindex)))
248 (info "Now processing character ~A index = ~A~%" current eindex)
249 (case current
250 ((#\.)
251 ;;
252 ;; Generate code for a single wild character
253 ;;
254 (add-exp '((if (>= index length)
255 (return-from compare nil)
256 (incf index)))))
257 ((#\$)
258 ;;
259 ;; If this is the last character of the expression then
260 ;; anchor the end of the expression, otherwise let it slide
261 ;; as a standard character (even though it should be quoted).
262 ;;
263 (if (= eindex (1- (length source)))
264 (add-exp '((if (not (= index length))
265 (return-from compare nil))))
266 (add-exp '((if (not (and (< index length)
267 (eql (char string index) #\$)))
268 (return-from compare nil)
269 (incf index))))))
270 ((#\*)
271 (add-exp '(ASTRISK)))
272
273 ((#\+)
274 (add-exp '(PLUS)))
275
276 ((#\?)
277 (add-exp '(QUESTION)))
278
279 ((#\()
280 ;;
281 ;; Start a grouping.
282 ;;
283 (incf group)
284 (push group group-stack)
285 (add-exp `((setf (aref *regex-groups* ,(1- group))
286 (list index nil))))
287 (add-exp `(,group)))
288 ((#\))
289 ;;
290 ;; End a grouping
291 ;;
292 (let ((group (pop group-stack)))
293 (add-exp `((setf (cadr (aref *regex-groups* ,(1- group)))
294 index)))
295 (add-exp `(,(- group)))))
296 ((#\[)
297 ;;
298 ;; Start of a range operation.
299 ;; Generate a bit-vector that has one bit per possible character
300 ;; and then on each character or range, set the possible bits.
301 ;;
302 ;; If the first character is carat then invert the set.
303 (let* ((invert (eql (char source (1+ eindex)) #\^))
304 (bitstring (make-array 256 :element-type 'bit
305 :initial-element
306 (if invert 1 0)))
307 (set-char (if invert 0 1)))
308 (if invert (incf eindex))
309 (do ((x (1+ eindex) (1+ x)))
310 ((eql (char source x) #\]) (setf eindex x))
311 (info "Building range with character ~A~%" (char source x))
312 (cond ((and (eql (char source (1+ x)) #\-)
313 (not (eql (char source (+ x 2)) #\])))
314 (if (>= (char-code (char source x))
315 (char-code (char source (+ 2 x))))
316 (error "Invalid range \"~A-~A\". Ranges must be in acending order"
317 (char source x) (char source (+ 2 x))))
318 (do ((j (char-code (char source x)) (1+ j)))
319 ((> j (char-code (char source (+ 2 x))))
320 (incf x 2))
321 (info "Setting bit for char ~A code ~A~%" (code-char j) j)
322 (setf (sbit bitstring j) set-char)))
323 (t
324 (cond ((not (eql (char source x) #\]))
325 (let ((char (char source x)))
326 ;;
327 ;; If the character is quoted then find out what
328 ;; it should have been
329 ;;
330 (if (eql (char source x) #\\ )
331 (let ((length))
332 (multiple-value-setq (char length)
333 (regex-quoted (subseq source x) invert))
334 (incf x length)))
335 (info "Setting bit for char ~A code ~A~%" char (char-code char))
336 (if (not (vectorp char))
337 (setf (sbit bitstring (char-code (char source x))) set-char)
338 (bit-ior bitstring char t))))))))
339 (add-exp `((let ((range ,bitstring))
340 (if (>= index length)
341 (return-from compare nil))
342 (if (= 1 (sbit range (char-code (char string index))))
343 (incf index)
344 (return-from compare nil)))))))
345 ((#\\ )
346 ;;
347 ;; Intreprete the next character as a special, range, octal, group or
348 ;; just the character itself.
349 ;;
350 (let ((length)
351 (value))
352 (multiple-value-setq (value length)
353 (regex-quoted (subseq source (1+ eindex)) nil))
354 (cond ((listp value)
355 (add-exp value))
356 ((characterp value)
357 (add-exp `((if (not (and (< index length)
358 (eql (char string index)
359 ,value)))
360 (return-from compare nil)
361 (incf index)))))
362 ((vectorp value)
363 (add-exp `((let ((range ,value))
364 (if (>= index length)
365 (return-from compare nil))
366 (if (= 1 (sbit range (char-code (char string index))))
367 (incf index)
368 (return-from compare nil)))))))
369 (incf eindex length)))
370 (t
371 ;;
372 ;; We have a literal character.
373 ;; Scan to see how many we have and if it is more than one
374 ;; generate a string= verses as single eql.
375 ;;
376 (let* ((lit "")
377 (term (dotimes (litindex (- (length source) eindex) nil)
378 (let ((litchar (char source (+ eindex litindex))))
379 (if (position litchar *regex-special-chars*)
380 (return litchar)
381 (progn
382 (info "Now adding ~A index ~A to lit~%" litchar
383 litindex)
384 (setf lit (concatenate 'string lit
385 (string litchar)))))))))
386 (if (= (length lit) 1)
387 (add-exp `((if (not (and (< index length)
388 (eql (char string index) ,current)))
389 (return-from compare nil)
390 (incf index))))
391 ;;
392 ;; If we have a multi-character literal then we must
393 ;; check to see if the next character (if there is one)
394 ;; is an astrisk or a plus or a question mark. If so then we must not use this
395 ;; character in the big literal.
396 (progn
397 (if (or (eql term #\*)
398 (eql term #\+)
399 (eql term #\?))
400 (setf lit (subseq lit 0 (1- (length lit)))))
401 (add-exp `((if (< length (+ index ,(length lit)))
402 (return-from compare nil))
403 (if (not (string= string ,lit :start1 index
404 :end1 (+ index ,(length lit))))
405 (return-from compare nil)
406 (incf index ,(length lit)))))))
407 (incf eindex (1- (length lit))))))))
408 ;;
409 ;; Plug end of list to return t. If we made it this far then
410 ;; We have matched!
411 (add-exp '((setf (cadr (aref *regex-groups* 0))
412 index)))
413 (add-exp '((return-from final-return t)))
414 ;;
415 ;;; (print expression)
416 ;;
417 ;; Now take the expression list and turn it into a lambda expression
418 ;; replacing the special flags with lisp code.
419 ;; For example: A BEGIN needs to be replace by an expression that
420 ;; saves the current index, then evaluates everything till it gets to
421 ;; the END then save the new index if it didn't fail.
422 ;; On an ASTRISK I need to take the previous expression and wrap
423 ;; it in a do that will evaluate the expression till an error
424 ;; occurs and then another do that encompases the remainder of the
425 ;; regular expression and iterates decrementing the index by one
426 ;; of the matched expression sizes and then returns nil. After
427 ;; the last expression insert a form that does a return t so that
428 ;; if the entire nested sub-expression succeeds then the loop
429 ;; is broken manually.
430 ;;
431 (setf result (copy-tree nil))
432 ;;
433 ;; Reversing the current expression makes building up the
434 ;; lambda list easier due to the nexting of expressions when
435 ;; and astrisk has been encountered.
436 (setf expression (reverse expression))
437 (do ((elt 0 (1+ elt)))
438 ((>= elt (length expression)))
439 (let ((piece (nth elt expression)))
440 ;;
441 ;; Now check for PLUS, if so then ditto the expression and then let the
442 ;; ASTRISK below handle the rest.
443 ;;
444 (cond ((eql piece 'PLUS)
445 (cond ((listp (nth (1+ elt) expression))
446 (setf result (append (list (nth (1+ elt) expression))
447 result)))
448 ;;
449 ;; duplicate the entire group
450 ;; NOTE: This hasn't been implemented yet!!
451 (t
452 (error "GROUP repeat hasn't been implemented yet~%")))))
453 (cond ((listp piece) ;Just append the list
454 (setf result (append (list piece) result)))
455 ((eql piece 'QUESTION) ; Wrap it in a block that won't fail
456 (cond ((listp (nth (1+ elt) expression))
457 (setf result
458 (append `((progn (block compare
459 ,(nth (1+ elt)
460 expression))
461 t))
462 result))
463 (incf elt))
464 ;;
465 ;; This is a QUESTION on an entire group which
466 ;; hasn't been implemented yet!!!
467 ;;
468 (t
469 (error "Optional groups not implemented yet~%"))))
470 ((or (eql piece 'ASTRISK) ; Do the wild thing!
471 (eql piece 'PLUS))
472 (cond ((listp (nth (1+ elt) expression))
473 ;;
474 ;; This is a single character wild card so
475 ;; do the simple form.
476 ;;
477 (setf result
478 `((let ((oindex index))
479 (block compare
480 (do ()
481 (nil)
482 ,(nth (1+ elt) expression)))
483 (do ((start index (1- start)))
484 ((< start oindex) nil)
485 (let ((index start))
486 (block compare
487 ,@result))))))
488 (incf elt))
489 (t
490 ;;
491 ;; This is a subgroup repeated so I must build
492 ;; the loop using several values.
493 ;;
494 ))
495 )
496 (t t)))) ; Just ignore everything else.
497 ;;
498 ;; Now wrap the result in a lambda list that can then be
499 ;; invoked or compiled, however the user wishes.
500 ;;
501 (if anchored
502 (setf result
503 `(lambda (string &key (start 0) (end (length string)))
504 (setf *regex-groupings* ,group)
505 (block final-return
506 (block compare
507 (let ((index start)
508 (length end))
509 ,@result)))))
510 (setf result
511 `(lambda (string &key (start 0) (end (length string)))
512 (setf *regex-groupings* ,group)
513 (block final-return
514 (let ((length end))
515 ,@fast-first
516 (do ((marker start (1+ marker)))
517 ((> marker end) nil)
518 (let ((index marker))
519 (if (block compare
520 ,@result)
521 (return t)))))))))))
522
523 ;; (provide 'nregex)

  ViewVC Help
Powered by ViewVC 1.1.5