;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- ;;; $Header: /usr/local/cvsrep/cl-ppcre/scanner.lisp,v 1.36 2009/09/17 19:17:31 edi Exp $ ;;; Here the scanner for the actual regex as well as utility scanners ;;; for the constant start and end strings are created. ;;; Copyright (c) 2002-2009, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions ;;; are met: ;;; * Redistributions of source code must retain the above copyright ;;; notice, this list of conditions and the following disclaimer. ;;; * Redistributions in binary form must reproduce the above ;;; copyright notice, this list of conditions and the following ;;; disclaimer in the documentation and/or other materials ;;; provided with the distribution. ;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED ;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY ;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE ;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS ;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. (in-package :cl-ppcre) (defmacro bmh-matcher-aux (&key case-insensitive-p) "Auxiliary macro used by CREATE-BMH-MATCHER." (let ((char-compare (if case-insensitive-p 'char-equal 'char=))) `(lambda (start-pos) (declare (fixnum start-pos)) (if (or (minusp start-pos) (> (the fixnum (+ start-pos m)) *end-pos*)) nil (loop named bmh-matcher for k of-type fixnum = (+ start-pos m -1) then (+ k (max 1 (aref skip (char-code (schar *string* k))))) while (< k *end-pos*) do (loop for j of-type fixnum downfrom (1- m) for i of-type fixnum downfrom k while (and (>= j 0) (,char-compare (schar *string* i) (schar pattern j))) finally (if (minusp j) (return-from bmh-matcher (1+ i))))))))) (defun create-bmh-matcher (pattern case-insensitive-p) "Returns a Boyer-Moore-Horspool matcher which searches the (special) simple-string *STRING* for the first occurence of the substring PATTERN. The search starts at the position START-POS within *STRING* and stops before *END-POS* is reached. Depending on the second argument the search is case-insensitive or not. If the special variable *USE-BMH-MATCHERS* is NIL, use the standard SEARCH function instead. \(BMH matchers are faster but need much more space.)" (declare #.*standard-optimize-settings*) ;; see for ;; details (unless *use-bmh-matchers* (let ((test (if case-insensitive-p #'char-equal #'char=))) (return-from create-bmh-matcher (lambda (start-pos) (declare (fixnum start-pos)) (and (not (minusp start-pos)) (search pattern *string* :start2 start-pos :end2 *end-pos* :test test)))))) (let* ((m (length pattern)) (skip (make-array *regex-char-code-limit* :element-type 'fixnum :initial-element m))) (declare (fixnum m)) (loop for k of-type fixnum below m if case-insensitive-p do (setf (aref skip (char-code (char-upcase (schar pattern k)))) (- m k 1) (aref skip (char-code (char-downcase (schar pattern k)))) (- m k 1)) else do (setf (aref skip (char-code (schar pattern k))) (- m k 1))) (if case-insensitive-p (bmh-matcher-aux :case-insensitive-p t) (bmh-matcher-aux)))) (defmacro char-searcher-aux (&key case-insensitive-p) "Auxiliary macro used by CREATE-CHAR-SEARCHER." (let ((char-compare (if case-insensitive-p 'char-equal 'char=))) `(lambda (start-pos) (declare (fixnum start-pos)) (and (not (minusp start-pos)) (loop for i of-type fixnum from start-pos below *end-pos* thereis (and (,char-compare (schar *string* i) chr) i)))))) (defun create-char-searcher (chr case-insensitive-p) "Returns a function which searches the (special) simple-string *STRING* for the first occurence of the character CHR. The search starts at the position START-POS within *STRING* and stops before *END-POS* is reached. Depending on the second argument the search is case-insensitive or not." (declare #.*standard-optimize-settings*) (if case-insensitive-p (char-searcher-aux :case-insensitive-p t) (char-searcher-aux))) (declaim (inline newline-skipper)) (defun newline-skipper (start-pos) "Finds the next occurence of a character in *STRING* which is behind a #\Newline." (declare #.*standard-optimize-settings*) (declare (fixnum start-pos)) ;; we can start with (1- START-POS) without testing for (PLUSP ;; START-POS) because we know we'll never call NEWLINE-SKIPPER on ;; the first iteration (loop for i of-type fixnum from (1- start-pos) below *end-pos* thereis (and (char= (schar *string* i) #\Newline) (1+ i)))) (defmacro insert-advance-fn (advance-fn) "Creates the actual closure returned by CREATE-SCANNER-AUX by replacing '(ADVANCE-FN-DEFINITION) with a suitable definition for ADVANCE-FN. This is a utility macro used by CREATE-SCANNER-AUX." (subst advance-fn '(advance-fn-definition) '(lambda (string start end) (block scan ;; initialize a couple of special variables used by the ;; matchers (see file specials.lisp) (let* ((*string* string) (*start-pos* start) (*end-pos* end) ;; we will search forward for END-STRING if this value ;; isn't at least as big as POS (see ADVANCE-FN), so it ;; is safe to start to the left of *START-POS*; note ;; that this value will _never_ be decremented - this ;; is crucial to the scanning process (*end-string-pos* (1- *start-pos*)) ;; the next five will shadow the variables defined by ;; DEFPARAMETER; at this point, we don't know if we'll ;; actually use them, though (*repeat-counters* *repeat-counters*) (*last-pos-stores* *last-pos-stores*) (*reg-starts* *reg-starts*) (*regs-maybe-start* *regs-maybe-start*) (*reg-ends* *reg-ends*) ;; we might be able to optimize the scanning process by ;; (virtually) shifting *START-POS* to the right (scan-start-pos *start-pos*) (starts-with-str (if start-string-test (str starts-with) nil)) ;; we don't need to try further than MAX-END-POS (max-end-pos (- *end-pos* min-len))) (declare (fixnum scan-start-pos) (function match-fn)) ;; definition of ADVANCE-FN will be inserted here by macrology (labels ((advance-fn-definition)) (declare (inline advance-fn)) (when (plusp rep-num) ;; we have at least one REPETITION which needs to count ;; the number of repetitions (setq *repeat-counters* (make-array rep-num :initial-element 0 :element-type 'fixnum))) (when (plusp zero-length-num) ;; we have at least one REPETITION which needs to watch ;; out for zero-length repetitions (setq *last-pos-stores* (make-array zero-length-num :initial-element nil))) (when (plusp reg-num) ;; we have registers in our regular expression (setq *reg-starts* (make-array reg-num :initial-element nil) *regs-maybe-start* (make-array reg-num :initial-element nil) *reg-ends* (make-array reg-num :initial-element nil))) (when end-anchored-p ;; the regular expression has a constant end string which ;; is anchored at the very end of the target string ;; (perhaps modulo a #\Newline) (let ((end-test-pos (- *end-pos* (the fixnum end-string-len)))) (declare (fixnum end-test-pos) (function end-string-test)) (unless (setq *end-string-pos* (funcall end-string-test end-test-pos)) (when (and (= 1 (the fixnum end-anchored-p)) (> *end-pos* scan-start-pos) (char= #\Newline (schar *string* (1- *end-pos*)))) ;; if we didn't find an end string candidate from ;; END-TEST-POS and if a #\Newline at the end is ;; allowed we try it again from one position to the ;; left (setq *end-string-pos* (funcall end-string-test (1- end-test-pos)))))) (unless (and *end-string-pos* (<= *start-pos* *end-string-pos*)) ;; no end string candidate found, so give up (return-from scan nil)) (when end-string-offset ;; if the offset of the constant end string from the ;; left of the regular expression is known we can start ;; scanning further to the right; this is similar to ;; what we might do in ADVANCE-FN (setq scan-start-pos (max scan-start-pos (- (the fixnum *end-string-pos*) (the fixnum end-string-offset)))))) (cond (start-anchored-p ;; we're anchored at the start of the target string, ;; so no need to try again after first failure (when (or (/= *start-pos* scan-start-pos) (< max-end-pos *start-pos*)) ;; if END-STRING-OFFSET has proven that we don't ;; need to bother to scan from *START-POS* or if the ;; minimal length of the regular expression is ;; longer than the target string we give up (return-from scan nil)) (when starts-with-str (locally (declare (fixnum starts-with-len)) (cond ((and (case-insensitive-p starts-with) (not (*string*-equal starts-with-str *start-pos* (+ *start-pos* starts-with-len) 0 starts-with-len))) ;; the regular expression has a ;; case-insensitive constant start string ;; and we didn't find it (return-from scan nil)) ((and (not (case-insensitive-p starts-with)) (not (*string*= starts-with-str *start-pos* (+ *start-pos* starts-with-len) 0 starts-with-len))) ;; the regular expression has a ;; case-sensitive constant start string ;; and we didn't find it (return-from scan nil)) (t nil)))) (when (and end-string-test (not end-anchored-p)) ;; the regular expression has a constant end string ;; which isn't anchored so we didn't check for it ;; already (block end-string-loop ;; we temporarily use *END-STRING-POS* as our ;; starting position to look for end string ;; candidates (setq *end-string-pos* *start-pos*) (loop (unless (setq *end-string-pos* (funcall (the function end-string-test) *end-string-pos*)) ;; no end string candidate found, so give up (return-from scan nil)) (unless end-string-offset ;; end string doesn't have an offset so we ;; can start scanning now (return-from end-string-loop)) (let ((maybe-start-pos (- (the fixnum *end-string-pos*) (the fixnum end-string-offset)))) (cond ((= maybe-start-pos *start-pos*) ;; offset of end string into regular ;; expression matches start anchor - ;; fine... (return-from end-string-loop)) ((and (< maybe-start-pos *start-pos*) (< (+ *end-string-pos* end-string-len) *end-pos*)) ;; no match but maybe we find another ;; one to the right - try again (incf *end-string-pos*)) (t ;; otherwise give up (return-from scan nil))))))) ;; if we got here we scan exactly once (let ((next-pos (funcall match-fn *start-pos*))) (when next-pos (values (if next-pos *start-pos* nil) next-pos *reg-starts* *reg-ends*)))) (t (loop for pos = (if starts-with-everything ;; don't jump to the next ;; #\Newline on the first ;; iteration scan-start-pos (advance-fn scan-start-pos)) then (advance-fn pos) ;; give up if the regular expression can't fit ;; into the rest of the target string while (and pos (<= (the fixnum pos) max-end-pos)) do (let ((next-pos (funcall match-fn pos))) (when next-pos (return-from scan (values pos next-pos *reg-starts* *reg-ends*))) ;; not yet found, increment POS #-cormanlisp (incf (the fixnum pos)) #+cormanlisp (incf pos))))))))) :test #'equalp)) (defun create-scanner-aux (match-fn min-len start-anchored-p starts-with start-string-test end-anchored-p end-string-test end-string-len end-string-offset rep-num zero-length-num reg-num) "Auxiliary function to create and return a scanner \(which is actually a closure). Used by CREATE-SCANNER." (declare #.*standard-optimize-settings*) (declare (fixnum min-len zero-length-num rep-num reg-num)) (let ((starts-with-len (if (typep starts-with 'str) (len starts-with))) (starts-with-everything (typep starts-with 'everything))) (cond ;; this COND statement dispatches on the different versions we ;; have for ADVANCE-FN and creates different closures for each; ;; note that you see only the bodies of ADVANCE-FN below - the ;; actual scanner is defined in INSERT-ADVANCE-FN above; (we ;; could have done this with closures instead of macrology but ;; would have consed a lot more) ((and start-string-test end-string-test end-string-offset) ;; we know that the regular expression has constant start and ;; end strings and we know the end string's offset (from the ;; left) (insert-advance-fn (advance-fn (pos) (declare (fixnum end-string-offset starts-with-len) (function start-string-test end-string-test)) (loop (unless (setq pos (funcall start-string-test pos)) ;; give up completely if we can't find a start string ;; candidate (return-from scan nil)) (locally ;; from here we know that POS is a FIXNUM (declare (fixnum pos)) (when (= pos (- (the fixnum *end-string-pos*) end-string-offset)) ;; if we already found an end string candidate the ;; position of which matches the start string ;; candidate we're done (return-from advance-fn pos)) (let ((try-pos (+ pos starts-with-len))) ;; otherwise try (again) to find an end string ;; candidate which starts behind the start string ;; candidate (loop (unless (setq *end-string-pos* (funcall end-string-test try-pos)) ;; no end string candidate found, so give up (return-from scan nil)) ;; NEW-POS is where we should start scanning ;; according to the end string candidate (let ((new-pos (- (the fixnum *end-string-pos*) end-string-offset))) (declare (fixnum new-pos *end-string-pos*)) (cond ((= new-pos pos) ;; if POS and NEW-POS are equal then the ;; two candidates agree so we're fine (return-from advance-fn pos)) ((> new-pos pos) ;; if NEW-POS is further to the right we ;; advance POS and try again, i.e. we go ;; back to the start of the outer LOOP (setq pos new-pos) ;; this means "return from inner LOOP" (return)) (t ;; otherwise NEW-POS is smaller than POS, ;; so we have to redo the inner LOOP to ;; find another end string candidate ;; further to the right (setq try-pos (1+ *end-string-pos*)))))))))))) ((and starts-with-everything end-string-test end-string-offset) ;; we know that the regular expression starts with ".*" (which ;; is not in single-line-mode, see CREATE-SCANNER-AUX) and ends ;; with a constant end string and we know the end string's ;; offset (from the left) (insert-advance-fn (advance-fn (pos) (declare (fixnum end-string-offset) (function end-string-test)) (loop (unless (setq pos (newline-skipper pos)) ;; if we can't find a #\Newline we give up immediately (return-from scan nil)) (locally ;; from here we know that POS is a FIXNUM (declare (fixnum pos)) (when (= pos (- (the fixnum *end-string-pos*) end-string-offset)) ;; if we already found an end string candidate the ;; position of which matches the place behind the ;; #\Newline we're done (return-from advance-fn pos)) (let ((try-pos pos)) ;; otherwise try (again) to find an end string ;; candidate which starts behind the #\Newline (loop (unless (setq *end-string-pos* (funcall end-string-test try-pos)) ;; no end string candidate found, so we give up (return-from scan nil)) ;; NEW-POS is where we should start scanning ;; according to the end string candidate (let ((new-pos (- (the fixnum *end-string-pos*) end-string-offset))) (declare (fixnum new-pos *end-string-pos*)) (cond ((= new-pos pos) ;; if POS and NEW-POS are equal then the ;; the end string candidate agrees with ;; the #\Newline so we're fine (return-from advance-fn pos)) ((> new-pos pos) ;; if NEW-POS is further to the right we ;; advance POS and try again, i.e. we go ;; back to the start of the outer LOOP (setq pos new-pos) ;; this means "return from inner LOOP" (return)) (t ;; otherwise NEW-POS is smaller than POS, ;; so we have to redo the inner LOOP to ;; find another end string candidate ;; further to the right (setq try-pos (1+ *end-string-pos*)))))))))))) ((and start-string-test end-string-test) ;; we know that the regular expression has constant start and ;; end strings; similar to the first case but we only need to ;; check for the end string, it doesn't provide enough ;; information to advance POS (insert-advance-fn (advance-fn (pos) (declare (function start-string-test end-string-test)) (unless (setq pos (funcall start-string-test pos)) (return-from scan nil)) (if (<= (the fixnum pos) (the fixnum *end-string-pos*)) (return-from advance-fn pos)) (unless (setq *end-string-pos* (funcall end-string-test pos)) (return-from scan nil)) pos))) ((and starts-with-everything end-string-test) ;; we know that the regular expression starts with ".*" (which ;; is not in single-line-mode, see CREATE-SCANNER-AUX) and ends ;; with a constant end string; similar to the second case but we ;; only need to check for the end string, it doesn't provide ;; enough information to advance POS (insert-advance-fn (advance-fn (pos) (declare (function end-string-test)) (unless (setq pos (newline-skipper pos)) (return-from scan nil)) (if (<= (the fixnum pos) (the fixnum *end-string-pos*)) (return-from advance-fn pos)) (unless (setq *end-string-pos* (funcall end-string-test pos)) (return-from scan nil)) pos))) (start-string-test ;; just check for constant start string candidate (insert-advance-fn (advance-fn (pos) (declare (function start-string-test)) (unless (setq pos (funcall start-string-test pos)) (return-from scan nil)) pos))) (starts-with-everything ;; just advance POS with NEWLINE-SKIPPER (insert-advance-fn (advance-fn (pos) (unless (setq pos (newline-skipper pos)) (return-from scan nil)) pos))) (end-string-test ;; just check for the next end string candidate if POS has ;; advanced beyond the last one (insert-advance-fn (advance-fn (pos) (declare (function end-string-test)) (if (<= (the fixnum pos) (the fixnum *end-string-pos*)) (return-from advance-fn pos)) (unless (setq *end-string-pos* (funcall end-string-test pos)) (return-from scan nil)) pos))) (t ;; not enough optimization information about the regular ;; expression to optimize so we just return POS (insert-advance-fn (advance-fn (pos) pos))))))