Skip to content
convert.lisp 39.7 KiB
Newer Older
Hans Huebner's avatar
Hans Huebner committed
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
Edi Weitz's avatar
Edi Weitz committed
;;; $Header: /usr/local/cvsrep/cl-ppcre/convert.lisp,v 1.57 2009/09/17 19:17:31 edi Exp $
Hans Huebner's avatar
Hans Huebner committed

;;; Here the parse tree is converted into its internal representation
;;; using REGEX objects.  At the same time some optimizations are
;;; already applied.

Edi Weitz's avatar
Edi Weitz committed
;;; Copyright (c) 2002-2009, Dr. Edmund Weitz. All rights reserved.
Hans Huebner's avatar
Hans Huebner committed

;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions
;;; are met:

;;;   * Redistributions of source code must retain the above copyright
;;;     notice, this list of conditions and the following disclaimer.

;;;   * Redistributions in binary form must reproduce the above
;;;     copyright notice, this list of conditions and the following
;;;     disclaimer in the documentation and/or other materials
;;;     provided with the distribution.

;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
;;; ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

Edi Weitz's avatar
Edi Weitz committed
(in-package :cl-ppcre)
Hans Huebner's avatar
Hans Huebner committed

;;; The flags that represent the "ism" modifiers are always kept
;;; together in a three-element list. We use the following macros to
;;; access individual elements.

(defmacro case-insensitive-mode-p (flags)
  "Accessor macro to extract the first flag out of a three-element flag list."
  `(first ,flags))

(defmacro multi-line-mode-p (flags)
  "Accessor macro to extract the second flag out of a three-element flag list."
  `(second ,flags))

(defmacro single-line-mode-p (flags)
  "Accessor macro to extract the third flag out of a three-element flag list."
  `(third ,flags))

(defun set-flag (token)
  "Reads a flag token and sets or unsets the corresponding entry in
the special FLAGS list."
Edi Weitz's avatar
Edi Weitz committed
  (declare #.*standard-optimize-settings*)
  (declare (special flags))
Hans Huebner's avatar
Hans Huebner committed
  (case token
    ((:case-insensitive-p)
      (setf (case-insensitive-mode-p flags) t))
    ((:case-sensitive-p)
      (setf (case-insensitive-mode-p flags) nil))
    ((:multi-line-mode-p)
      (setf (multi-line-mode-p flags) t))
    ((:not-multi-line-mode-p)
      (setf (multi-line-mode-p flags) nil))
    ((:single-line-mode-p)
      (setf (single-line-mode-p flags) t))
    ((:not-single-line-mode-p)
      (setf (single-line-mode-p flags) nil))
    (otherwise
Edi Weitz's avatar
Edi Weitz committed
      (signal-syntax-error "Unknown flag token ~A." token))))

(defgeneric resolve-property (property)
  (:documentation "Resolves PROPERTY to a unary character test
function.  PROPERTY can either be a function designator or it can be a
string which is resolved using *PROPERTY-RESOLVER*.")
  (:method ((property-name string))
   (funcall *property-resolver* property-name))
  (:method ((function-name symbol))
   function-name)
  (:method ((test-function function))
   test-function))
Hans Huebner's avatar
Hans Huebner committed

Edi Weitz's avatar
Edi Weitz committed
(defun convert-char-class-to-test-function (list invertedp case-insensitive-p)
  "Combines all items in LIST into test function and returns a
logical-OR combination of these functions.  Items can be single
characters, character ranges like \(:RANGE #\\A #\\E), or special
character classes like :DIGIT-CLASS.  Does the right thing with
respect to case-\(in)sensitivity as specified by the special variable
FLAGS."
  (declare #.*standard-optimize-settings*)
Hans Huebner's avatar
Hans Huebner committed
  (declare (special flags))
Edi Weitz's avatar
Edi Weitz committed
  (let ((test-functions
         (loop for item in list
               collect (cond ((characterp item)
                              ;; rebind so closure captures the right one
                              (let ((this-char item))
                                (lambda (char)
                                  (declare (character char this-char))
                                  (char= char this-char))))
                             ((symbolp item)
                              (case item
                                ((:digit-class) #'digit-char-p)
                                ((:non-digit-class) (complement* #'digit-char-p))
                                ((:whitespace-char-class) #'whitespacep)
                                ((:non-whitespace-char-class) (complement* #'whitespacep))
                                ((:word-char-class) #'word-char-p)
                                ((:non-word-char-class) (complement* #'word-char-p))
                                (otherwise
                                 (signal-syntax-error "Unknown symbol ~A in character class." item))))
                             ((and (consp item)
                                   (eq (first item) :property))
                              (resolve-property (second item)))
                             ((and (consp item)
                                   (eq (first item) :inverted-property))
                              (complement* (resolve-property (second item))))
                             ((and (consp item)
                                   (eq (first item) :range))
                              (let ((from (second item))
                                    (to (third item)))
                                (when (char> from to)
                                  (signal-syntax-error "Invalid range from ~S to ~S in char-class." from to))
                                (lambda (char)
                                  (declare (character char from to))
                                  (char<= from char to))))
                             (t (signal-syntax-error "Unknown item ~A in char-class list." item))))))
    (unless test-functions
      (signal-syntax-error "Empty character class."))
    (cond ((cdr test-functions)           
           (cond ((and invertedp case-insensitive-p)
                  (lambda (char)
                    (declare (character char))
                    (loop with both-case-p = (both-case-p char)
                          with char-down = (if both-case-p (char-downcase char) char)
                          with char-up = (if both-case-p (char-upcase char) nil)
                          for test-function in test-functions
                          never (or (funcall test-function char-down)
                                    (and char-up (funcall test-function char-up))))))
                 (case-insensitive-p
                  (lambda (char)
                    (declare (character char))
                    (loop with both-case-p = (both-case-p char)
                          with char-down = (if both-case-p (char-downcase char) char)
                          with char-up = (if both-case-p (char-upcase char) nil)
                          for test-function in test-functions
                          thereis (or (funcall test-function char-down)
                                      (and char-up (funcall test-function char-up))))))
                 (invertedp
                  (lambda (char)
                    (loop for test-function in test-functions
                          never (funcall test-function char))))
                 (t
                  (lambda (char)
                    (loop for test-function in test-functions
                          thereis (funcall test-function char))))))
          ;; there's only one test-function
          (t (let ((test-function (first test-functions)))
               (cond ((and invertedp case-insensitive-p)
                      (lambda (char)
                        (declare (character char))
                        (not (or (funcall test-function (char-downcase char))
                                 (and (both-case-p char)
                                      (funcall test-function (char-upcase char)))))))
                     (case-insensitive-p
                      (lambda (char)
                        (declare (character char))
                        (or (funcall test-function (char-downcase char))
                            (and (both-case-p char)
                                 (funcall test-function (char-upcase char))))))
                     (invertedp (complement* test-function))
                     (t test-function)))))))
Hans Huebner's avatar
Hans Huebner committed

(defun maybe-split-repetition (regex
                               greedyp
                               minimum
                               maximum
                               min-len
                               length
                               reg-seen)
  "Splits a REPETITION object into a constant and a varying part if
applicable, i.e. something like
  a{3,} -> a{3}a*
The arguments to this function correspond to the REPETITION slots of
the same name."
Edi Weitz's avatar
Edi Weitz committed
  (declare #.*standard-optimize-settings*)
  (declare (fixnum minimum)
Edi Weitz's avatar
Edi Weitz committed
           (type (or fixnum null) maximum))
Hans Huebner's avatar
Hans Huebner committed
  ;; note the usage of COPY-REGEX here; we can't use the same REGEX
  ;; object in both REPETITIONS because they will have different
  ;; offsets
  (when maximum
    (when (zerop maximum)
      ;; trivial case: don't repeat at all
      (return-from maybe-split-repetition
        (make-instance 'void)))
    (when (= 1 minimum maximum)
      ;; another trivial case: "repeat" exactly once
      (return-from maybe-split-repetition
        regex)))
  ;; first set up the constant part of the repetition
  ;; maybe that's all we need
  (let ((constant-repetition (if (plusp minimum)
                               (make-instance 'repetition
                                              :regex (copy-regex regex)
                                              :greedyp greedyp
                                              :minimum minimum
                                              :maximum minimum
                                              :min-len min-len
                                              :len length
                                              :contains-register-p reg-seen)
                               ;; don't create garbage if minimum is 0
                               nil)))
    (when (and maximum
               (= maximum minimum))
      (return-from maybe-split-repetition
        ;; no varying part needed because min = max
        constant-repetition))
    ;; now construct the varying part
    (let ((varying-repetition
            (make-instance 'repetition
                           :regex regex
                           :greedyp greedyp
                           :minimum 0
                           :maximum (if maximum (- maximum minimum) nil)
                           :min-len min-len
                           :len length
                           :contains-register-p reg-seen)))
      (cond ((zerop minimum)
              ;; min = 0, no constant part needed
              varying-repetition)
            ((= 1 minimum)
              ;; min = 1, constant part needs no REPETITION wrapped around
              (make-instance 'seq
                             :elements (list (copy-regex regex)
                                             varying-repetition)))
            (t
              ;; general case
              (make-instance 'seq
                             :elements (list constant-repetition
                                             varying-repetition)))))))

;; During the conversion of the parse tree we keep track of the start
;; of the parse tree in the special variable STARTS-WITH which'll
;; either hold a STR object or an EVERYTHING object. The latter is the
;; case if the regex starts with ".*" which implicitly anchors the
Hans Huebner's avatar
Hans Huebner committed
;; regex at the start (perhaps modulo #\Newline).

(defun maybe-accumulate (str)
Hans Huebner's avatar
Hans Huebner committed
  "Accumulate STR into the special variable STARTS-WITH if
ACCUMULATE-START-P (also special) is true and STARTS-WITH is either
NIL or a STR object of the same case mode. Always returns NIL."
Edi Weitz's avatar
Edi Weitz committed
  (declare #.*standard-optimize-settings*)
  (declare (special accumulate-start-p starts-with))
  (declare (ftype (function (t) fixnum) len))
Hans Huebner's avatar
Hans Huebner committed
  (when accumulate-start-p
    (etypecase starts-with
      (str
        ;; STARTS-WITH already holds a STR, so we check if we can
        ;; concatenate
        (cond ((eq (case-insensitive-p starts-with)
                   (case-insensitive-p str))
                ;; we modify STARTS-WITH in place
                (setf (len starts-with)
                        (+ (len starts-with) (len str)))
                ;; note that we use SLOT-VALUE because the accessor
                ;; STR has a declared FTYPE which doesn't fit here
                (adjust-array (slot-value starts-with 'str)
                              (len starts-with)
                              :fill-pointer t)
                (setf (subseq (slot-value starts-with 'str)
                              (- (len starts-with) (len str)))
                        (str str)
                      ;; STR objects that are parts of STARTS-WITH
                      ;; always have their SKIP slot set to true
                      ;; because the SCAN function will take care of
                      ;; them, i.e. the matcher can ignore them
                      (skip str) t))
              (t (setq accumulate-start-p nil))))
      (null
        ;; STARTS-WITH is still empty, so we create a new STR object
        (setf starts-with
                (make-instance 'str
                               :str ""
                               :case-insensitive-p (case-insensitive-p str))
              ;; INITIALIZE-INSTANCE will coerce the STR to a simple
              ;; string, so we have to fill it afterwards
              (slot-value starts-with 'str)
                (make-array (len str)
                            :initial-contents (str str)
                            :element-type 'character
                            :fill-pointer t
                            :adjustable t)
              (len starts-with)
                (len str)
              ;; see remark about SKIP above
              (skip str) t))
      (everything
        ;; STARTS-WITH already holds an EVERYTHING object - we can't
        ;; concatenate
        (setq accumulate-start-p nil))))
  nil)

Edi Weitz's avatar
Edi Weitz committed
(declaim (inline convert-aux))
Hans Huebner's avatar
Hans Huebner committed
(defun convert-aux (parse-tree)
Edi Weitz's avatar
Edi Weitz committed
  "Converts the parse tree PARSE-TREE into a REGEX object and returns
it.  Will also
Hans Huebner's avatar
Hans Huebner committed

  - split and optimize repetitions,
  - accumulate strings or EVERYTHING objects into the special variable
    STARTS-WITH,
  - keep track of all registers seen in the special variable REG-NUM,
  - keep track of all named registers seen in the special variable REG-NAMES
Hans Huebner's avatar
Hans Huebner committed
  - keep track of the highest backreference seen in the special
    variable MAX-BACK-REF,
  - maintain and adher to the currently applicable modifiers in the special
    variable FLAGS, and
  - maybe even wash your car..."
Edi Weitz's avatar
Edi Weitz committed
  (declare #.*standard-optimize-settings*)
  (if (consp parse-tree)
    (convert-compound-parse-tree (first parse-tree) parse-tree)
    (convert-simple-parse-tree parse-tree)))
Hans Huebner's avatar
Hans Huebner committed

Edi Weitz's avatar
Edi Weitz committed
(defgeneric convert-compound-parse-tree (token parse-tree &key)
  (declare #.*standard-optimize-settings*)
  (:documentation "Helper function for CONVERT-AUX which converts
parse trees which are conses and dispatches on TOKEN which is the
first element of the parse tree.")
Edi Weitz's avatar
Edi Weitz committed
  (:method ((token t) parse-tree &key)
Edi Weitz's avatar
Edi Weitz committed
   (signal-syntax-error "Unknown token ~A in parse-tree." token)))

(defmethod convert-compound-parse-tree ((token (eql :sequence)) parse-tree &key)
  "The case for parse trees like \(:SEQUENCE {<regex>}*)."
  (declare #.*standard-optimize-settings*)
  (cond ((cddr parse-tree)
         ;; this is essentially like
         ;; (MAPCAR 'CONVERT-AUX (REST PARSE-TREE))
         ;; but we don't cons a new list
         (loop for parse-tree-rest on (rest parse-tree)
               while parse-tree-rest
               do (setf (car parse-tree-rest)
                        (convert-aux (car parse-tree-rest))))
         (make-instance 'seq :elements (rest parse-tree)))
        (t (convert-aux (second parse-tree)))))

(defmethod convert-compound-parse-tree ((token (eql :group)) parse-tree &key)
  "The case for parse trees like \(:GROUP {<regex>}*).

This is a syntactical construct equivalent to :SEQUENCE intended to
keep the effect of modifiers local."
  (declare #.*standard-optimize-settings*)
  (declare (special flags))
  ;; make a local copy of FLAGS and shadow the global value while we
  ;; descend into the enclosed regexes
  (let ((flags (copy-list flags)))
    (declare (special flags))
    (cond ((cddr parse-tree)
           (loop for parse-tree-rest on (rest parse-tree)
                 while parse-tree-rest
                 do (setf (car parse-tree-rest)
                          (convert-aux (car parse-tree-rest))))
           (make-instance 'seq :elements (rest parse-tree)))
          (t (convert-aux (second parse-tree))))))

(defmethod convert-compound-parse-tree ((token (eql :alternation)) parse-tree &key)
  "The case for \(:ALTERNATION {<regex>}*)."
  (declare #.*standard-optimize-settings*)
  (declare (special accumulate-start-p))
  ;; we must stop accumulating objects into STARTS-WITH once we reach
  ;; an alternation
  (setq accumulate-start-p nil)
  (loop for parse-tree-rest on (rest parse-tree)
        while parse-tree-rest
        do (setf (car parse-tree-rest)
                 (convert-aux (car parse-tree-rest))))
  (make-instance 'alternation :choices (rest parse-tree)))

(defmethod convert-compound-parse-tree ((token (eql :branch)) parse-tree &key)
  "The case for \(:BRANCH <test> <regex>).

Here, <test> must be look-ahead, look-behind or number; if <regex> is
an alternation it must have one or two choices."
  (declare #.*standard-optimize-settings*)
  (declare (special accumulate-start-p))
  (setq accumulate-start-p nil)
  (let* ((test-candidate (second parse-tree))
         (test (cond ((numberp test-candidate)
                      (when (zerop (the fixnum test-candidate))
                        (signal-syntax-error "Register 0 doesn't exist: ~S." parse-tree))
                      (1- (the fixnum test-candidate)))
                     (t (convert-aux test-candidate))))
         (alternations (convert-aux (third parse-tree))))
    (when (and (not (numberp test))
               (not (typep test 'lookahead))
               (not (typep test 'lookbehind)))
      (signal-syntax-error "Branch test must be look-ahead, look-behind or number: ~S." parse-tree))
    (typecase alternations
      (alternation
       (case (length (choices alternations))
         ((0)
          (signal-syntax-error "No choices in branch: ~S." parse-tree))
         ((1)
          (make-instance 'branch
                         :test test
                         :then-regex (first
                                      (choices alternations))))
         ((2)
          (make-instance 'branch
                         :test test
                         :then-regex (first
                                      (choices alternations))
                         :else-regex (second
                                      (choices alternations))))
         (otherwise
          (signal-syntax-error "Too much choices in branch: ~S." parse-tree))))
      (t
       (make-instance 'branch
                      :test test
                      :then-regex alternations)))))

(defmethod convert-compound-parse-tree ((token (eql :positive-lookahead)) parse-tree &key)
  "The case for \(:POSITIVE-LOOKAHEAD <regex>)."
  (declare #.*standard-optimize-settings*)
  (declare (special flags accumulate-start-p))
  ;; keep the effect of modifiers local to the enclosed regex and stop
  ;; accumulating into STARTS-WITH
  (setq accumulate-start-p nil)
  (let ((flags (copy-list flags)))
    (declare (special flags))
    (make-instance 'lookahead
                   :regex (convert-aux (second parse-tree))
                   :positivep t)))

(defmethod convert-compound-parse-tree ((token (eql :negative-lookahead)) parse-tree &key)
  "The case for \(:NEGATIVE-LOOKAHEAD <regex>)."
  (declare #.*standard-optimize-settings*)
  ;; do the same as for positive look-aheads and just switch afterwards
  (let ((regex (convert-compound-parse-tree :positive-lookahead parse-tree)))
    (setf (slot-value regex 'positivep) nil)
    regex))

(defmethod convert-compound-parse-tree ((token (eql :positive-lookbehind)) parse-tree &key)
  "The case for \(:POSITIVE-LOOKBEHIND <regex>)."
  (declare #.*standard-optimize-settings*)
  (declare (special flags accumulate-start-p))
  ;; keep the effect of modifiers local to the enclosed regex and stop
  ;; accumulating into STARTS-WITH
  (setq accumulate-start-p nil)
  (let* ((flags (copy-list flags))
         (regex (convert-aux (second parse-tree)))
         (len (regex-length regex)))
    (declare (special flags))
    ;; lookbehind assertions must be of fixed length
    (unless len
      (signal-syntax-error "Variable length look-behind not implemented \(yet): ~S." parse-tree))
    (make-instance 'lookbehind
                   :regex regex
                   :positivep t
                   :len len)))

(defmethod convert-compound-parse-tree ((token (eql :negative-lookbehind)) parse-tree &key)
  "The case for \(:NEGATIVE-LOOKBEHIND <regex>)."
  (declare #.*standard-optimize-settings*)
  ;; do the same as for positive look-behinds and just switch afterwards
  (let ((regex (convert-compound-parse-tree :positive-lookbehind parse-tree)))
    (setf (slot-value regex 'positivep) nil)
    regex))

(defmethod convert-compound-parse-tree ((token (eql :greedy-repetition)) parse-tree &key (greedyp t))
  "The case for \(:GREEDY-REPETITION|:NON-GREEDY-REPETITION <min> <max> <regex>).

This function is also used for the non-greedy case in which case it is
called with GREEDYP set to NIL as you would expect."
  (declare #.*standard-optimize-settings*)
Edi Weitz's avatar
Edi Weitz committed
  (declare (special accumulate-start-p starts-with))
  ;; remember the value of ACCUMULATE-START-P upon entering
  (let ((local-accumulate-start-p accumulate-start-p))
    (let ((minimum (second parse-tree))
          (maximum (third parse-tree)))
      (declare (fixnum minimum))
      (declare (type (or null fixnum) maximum))
      (unless (and maximum
                   (= 1 minimum maximum))
        ;; set ACCUMULATE-START-P to NIL for the rest of
        ;; the conversion because we can't continue to
        ;; accumulate inside as well as after a proper
        ;; repetition
        (setq accumulate-start-p nil))
      (let* (reg-seen
             (regex (convert-aux (fourth parse-tree)))
             (min-len (regex-min-length regex))
             (length (regex-length regex)))
        ;; note that this declaration already applies to
        ;; the call to CONVERT-AUX above
        (declare (special reg-seen))
        (when (and local-accumulate-start-p
                   (not starts-with)
                   (zerop minimum)
                   (not maximum))
          ;; if this repetition is (equivalent to) ".*"
          ;; and if we're at the start of the regex we
          ;; remember it for ADVANCE-FN (see the SCAN
          ;; function)
          (setq starts-with (everythingp regex)))
        (if (or (not reg-seen)
                (not greedyp)
                (not length)
                (zerop length)
                (and maximum (= minimum maximum)))
          ;; the repetition doesn't enclose a register, or
          ;; it's not greedy, or we can't determine it's
          ;; (inner) length, or the length is zero, or the
          ;; number of repetitions is fixed; in all of
          ;; these cases we don't bother to optimize
          (maybe-split-repetition regex
                                  greedyp
                                  minimum
                                  maximum
                                  min-len
                                  length
                                  reg-seen)
          ;; otherwise we make a transformation that looks
          ;; roughly like one of
          ;;   <regex>* -> (?:<regex'>*<regex>)?
          ;;   <regex>+ -> <regex'>*<regex>
          ;; where the trick is that as much as possible
          ;; registers from <regex> are removed in
          ;; <regex'>
          (let* (reg-seen ; new instance for REMOVE-REGISTERS
                 (remove-registers-p t)
                 (inner-regex (remove-registers regex))
                 (inner-repetition
                  ;; this is the "<regex'>" part
                  (maybe-split-repetition inner-regex
                                          ;; always greedy
                                          t
                                          ;; reduce minimum by 1
                                          ;; unless it's already 0
                                          (if (zerop minimum)
                                            0
                                            (1- minimum))
                                          ;; reduce maximum by 1
                                          ;; unless it's NIL
                                          (and maximum
                                               (1- maximum))
                                          min-len
                                          length
                                          reg-seen))
                 (inner-seq
                  ;; this is the "<regex'>*<regex>" part
                  (make-instance 'seq
                                 :elements (list inner-repetition
                                                 regex))))
            ;; note that this declaration already applies
            ;; to the call to REMOVE-REGISTERS above
            (declare (special remove-registers-p reg-seen))
            ;; wrap INNER-SEQ with a greedy
            ;; {0,1}-repetition (i.e. "?") if necessary
            (if (plusp minimum)
              inner-seq
              (maybe-split-repetition inner-seq
                                      t
                                      0
                                      1
                                      min-len
                                      nil
                                      t))))))))

(defmethod convert-compound-parse-tree ((token (eql :non-greedy-repetition)) parse-tree &key)
  "The case for \(:NON-GREEDY-REPETITION <min> <max> <regex>)."
  (declare #.*standard-optimize-settings*)
  ;; just dispatch to the method above with GREEDYP explicitly set to NIL
  (convert-compound-parse-tree :greedy-repetition parse-tree :greedyp nil))

(defmethod convert-compound-parse-tree ((token (eql :register)) parse-tree &key name)
  "The case for \(:REGISTER <regex>).  Also used for named registers
when NAME is not NIL."
  (declare #.*standard-optimize-settings*)
  (declare (special flags reg-num reg-names))
  ;; keep the effect of modifiers local to the enclosed regex; also,
  ;; assign the current value of REG-NUM to the corresponding slot of
  ;; the REGISTER object and increase this counter afterwards; for
  ;; named register update REG-NAMES and set the corresponding name
  ;; slot of the REGISTER object too
  (let ((flags (copy-list flags))
        (stored-reg-num reg-num))
    (declare (special flags reg-seen named-reg-seen))
    (setq reg-seen t)
    (when name (setq named-reg-seen t))
    (incf (the fixnum reg-num))
    (push name reg-names)
    (make-instance 'register
                   :regex (convert-aux (if name (third parse-tree) (second parse-tree)))
                   :num stored-reg-num
                   :name name)))

(defmethod convert-compound-parse-tree ((token (eql :named-register)) parse-tree &key)
  "The case for \(:NAMED-REGISTER <regex>)."
  (declare #.*standard-optimize-settings*)
  ;; call the method above and use the :NAME keyword argument
  (convert-compound-parse-tree :register parse-tree :name (copy-seq (second parse-tree))))

(defmethod convert-compound-parse-tree ((token (eql :filter)) parse-tree &key)
  "The case for \(:FILTER <function> &optional <length>)."
  (declare #.*standard-optimize-settings*)
  (declare (special accumulate-start-p))
  ;; stop accumulating into STARTS-WITH
  (setq accumulate-start-p nil)
  (make-instance 'filter
                 :fn (second parse-tree)
                 :len (third parse-tree)))

(defmethod convert-compound-parse-tree ((token (eql :standalone)) parse-tree &key)
  "The case for \(:STANDALONE <regex>)."
  (declare #.*standard-optimize-settings*)
  (declare (special flags accumulate-start-p))
  ;; stop accumulating into STARTS-WITH
  (setq accumulate-start-p nil)
  ;; keep the effect of modifiers local to the enclosed regex
  (let ((flags (copy-list flags)))
    (declare (special flags))
    (make-instance 'standalone :regex (convert-aux (second parse-tree)))))

(defmethod convert-compound-parse-tree ((token (eql :back-reference)) parse-tree &key)
  "The case for \(:BACK-REFERENCE <number>|<name>)."
  (declare #.*standard-optimize-settings*)
  (declare (special flags accumulate-start-p reg-num reg-names max-back-ref))
  (let* ((backref-name (and (stringp (second parse-tree))
                            (second parse-tree)))
         (referred-regs
          (when backref-name
            ;; find which register corresponds to the given name
            ;; we have to deal with case where several registers share
            ;; the same name and collect their respective numbers
            (loop for name in reg-names
                  for reg-index from 0
                  when (string= name backref-name)
                  ;; NOTE: REG-NAMES stores register names in reversed
                  ;; order REG-NUM contains number of (any) registers
                  ;; seen so far; 1- will be done later
                  collect (- reg-num reg-index))))
         ;; store the register number for the simple case
         (backref-number (or (first referred-regs) (second parse-tree))))
    (declare (type (or fixnum null) backref-number))
    (when (or (not (typep backref-number 'fixnum))
              (<= backref-number 0))
      (signal-syntax-error "Illegal back-reference: ~S." parse-tree))
    ;; stop accumulating into STARTS-WITH and increase MAX-BACK-REF if
    ;; necessary
    (setq accumulate-start-p nil
          max-back-ref (max (the fixnum max-back-ref)
                            backref-number))
    (flet ((make-back-ref (backref-number)
             (make-instance 'back-reference
                            ;; we start counting from 0 internally
                            :num (1- backref-number)
                            :case-insensitive-p (case-insensitive-mode-p flags)
                            ;; backref-name is NIL or string, safe to copy
                            :name (copy-seq backref-name))))
      (cond
       ((cdr referred-regs)
        ;; several registers share the same name we will try to match
        ;; any of them, starting with the most recent first
        ;; alternation is used to accomplish matching
        (make-instance 'alternation
                       :choices (loop
                                 for reg-index in referred-regs
                                 collect (make-back-ref reg-index))))
       ;; simple case - backref corresponds to only one register
       (t
        (make-back-ref backref-number))))))

(defmethod convert-compound-parse-tree ((token (eql :regex)) parse-tree &key)
  "The case for \(:REGEX <string>)."
  (declare #.*standard-optimize-settings*)
  (convert-aux (parse-string (second parse-tree))))

(defmethod convert-compound-parse-tree ((token (eql :char-class)) parse-tree &key invertedp)
  "The case for \(:CHAR-CLASS {<item>}*) where item is one of

- a character,
- a character range: \(:RANGE <char1> <char2>), or
- a special char class symbol like :DIGIT-CHAR-CLASS.

Also used for inverted char classes when INVERTEDP is true."
  (declare #.*standard-optimize-settings*)
  (declare (special flags accumulate-start-p))
  (let ((test-function
         (create-optimized-test-function
          (convert-char-class-to-test-function (rest parse-tree)
                                               invertedp
                                               (case-insensitive-mode-p flags)))))
    (setq accumulate-start-p nil)
    (make-instance 'char-class :test-function test-function)))

(defmethod convert-compound-parse-tree ((token (eql :inverted-char-class)) parse-tree &key)
  "The case for \(:INVERTED-CHAR-CLASS {<item>}*)."
  (declare #.*standard-optimize-settings*)
  ;; just dispatch to the "real" method
  (convert-compound-parse-tree :char-class parse-tree :invertedp t))

(defmethod convert-compound-parse-tree ((token (eql :property)) parse-tree &key)
  "The case for \(:PROPERTY <name>) where <name> is a string."
  (declare #.*standard-optimize-settings*)
  (make-instance 'char-class :test-function (resolve-property (second parse-tree))))

(defmethod convert-compound-parse-tree ((token (eql :inverted-property)) parse-tree &key)
  "The case for \(:INVERTED-PROPERTY <name>) where <name> is a string."
  (declare #.*standard-optimize-settings*)
  (make-instance 'char-class :test-function (complement* (resolve-property (second parse-tree)))))

(defmethod convert-compound-parse-tree ((token (eql :flags)) parse-tree &key)
  "The case for \(:FLAGS {<flag>}*) where flag is a modifier symbol
like :CASE-INSENSITIVE-P."
  (declare #.*standard-optimize-settings*)
  ;; set/unset the flags corresponding to the symbols
  ;; following :FLAGS
  (mapc #'set-flag (rest parse-tree))
  ;; we're only interested in the side effect of
  ;; setting/unsetting the flags and turn this syntactical
  ;; construct into a VOID object which'll be optimized
  ;; away when creating the matcher
  (make-instance 'void))

(defgeneric convert-simple-parse-tree (parse-tree)
  (declare #.*standard-optimize-settings*)
  (:documentation "Helper function for CONVERT-AUX which converts
parse trees which are atoms.")
  (:method ((parse-tree (eql :void)))
   (declare #.*standard-optimize-settings*)
   (make-instance 'void))
  (:method ((parse-tree (eql :word-boundary)))
   (declare #.*standard-optimize-settings*)
   (make-instance 'word-boundary :negatedp nil))
  (:method ((parse-tree (eql :non-word-boundary)))
   (declare #.*standard-optimize-settings*)
   (make-instance 'word-boundary :negatedp t))
  (:method ((parse-tree (eql :everything)))
   (declare #.*standard-optimize-settings*)
   (declare (special flags accumulate-start-p))
   (setq accumulate-start-p nil)
   (make-instance 'everything :single-line-p (single-line-mode-p flags)))
  (:method ((parse-tree (eql :digit-class)))
   (declare #.*standard-optimize-settings*)
   (declare (special accumulate-start-p))
   (setq accumulate-start-p nil)
   (make-instance 'char-class :test-function #'digit-char-p))
  (:method ((parse-tree (eql :word-char-class)))
   (declare #.*standard-optimize-settings*)
   (declare (special accumulate-start-p))
   (setq accumulate-start-p nil)
   (make-instance 'char-class :test-function #'word-char-p))
  (:method ((parse-tree (eql :whitespace-char-class)))
   (declare #.*standard-optimize-settings*)
   (declare (special accumulate-start-p))
   (setq accumulate-start-p nil)
   (make-instance 'char-class :test-function #'whitespacep))
  (:method ((parse-tree (eql :non-digit-class)))
   (declare #.*standard-optimize-settings*)
   (declare (special accumulate-start-p))
   (setq accumulate-start-p nil)
   (make-instance 'char-class :test-function (complement* #'digit-char-p)))
  (:method ((parse-tree (eql :non-word-char-class)))
   (declare #.*standard-optimize-settings*)
   (declare (special accumulate-start-p))
   (setq accumulate-start-p nil)
   (make-instance 'char-class :test-function (complement* #'word-char-p)))
  (:method ((parse-tree (eql :non-whitespace-char-class)))
   (declare #.*standard-optimize-settings*)
   (declare (special accumulate-start-p))
   (setq accumulate-start-p nil)
   (make-instance 'char-class :test-function (complement* #'whitespacep)))
  (:method ((parse-tree (eql :start-anchor)))
   ;; Perl's "^"
   (declare #.*standard-optimize-settings*)
   (declare (special flags))
   (make-instance 'anchor :startp t :multi-line-p (multi-line-mode-p flags)))
  (:method ((parse-tree (eql :end-anchor)))
   ;; Perl's "$"
   (declare #.*standard-optimize-settings*)
   (declare (special flags))
   (make-instance 'anchor :startp nil :multi-line-p (multi-line-mode-p flags)))
  (:method ((parse-tree (eql :modeless-start-anchor)))
   ;; Perl's "\A"
   (declare #.*standard-optimize-settings*)
   (make-instance 'anchor :startp t))
  (:method ((parse-tree (eql :modeless-end-anchor)))
   ;; Perl's "$\Z"
   (declare #.*standard-optimize-settings*)
   (make-instance 'anchor :startp nil))
  (:method ((parse-tree (eql :modeless-end-anchor-no-newline)))
   ;; Perl's "$\z"
   (declare #.*standard-optimize-settings*)
   (make-instance 'anchor :startp nil :no-newline-p t))
  (:method ((parse-tree (eql :case-insensitive-p)))
   (declare #.*standard-optimize-settings*)
   (set-flag parse-tree)
   (make-instance 'void))
  (:method ((parse-tree (eql :case-sensitive-p)))
   (declare #.*standard-optimize-settings*)
   (set-flag parse-tree)
   (make-instance 'void))
  (:method ((parse-tree (eql :multi-line-mode-p)))
   (declare #.*standard-optimize-settings*)
   (set-flag parse-tree)
   (make-instance 'void))
  (:method ((parse-tree (eql :not-multi-line-mode-p)))
   (declare #.*standard-optimize-settings*)
   (set-flag parse-tree)
   (make-instance 'void))
  (:method ((parse-tree (eql :single-line-mode-p)))
   (declare #.*standard-optimize-settings*)
   (set-flag parse-tree)
   (make-instance 'void))
  (:method ((parse-tree (eql :not-single-line-mode-p)))
   (declare #.*standard-optimize-settings*)
   (set-flag parse-tree)
   (make-instance 'void)))

(defmethod convert-simple-parse-tree ((parse-tree string))
  (declare #.*standard-optimize-settings*)
  (declare (special flags))
  ;; turn strings into STR objects and try to accumulate into
  ;; STARTS-WITH
  (let ((str (make-instance 'str
                            :str parse-tree
                            :case-insensitive-p (case-insensitive-mode-p flags))))
    (maybe-accumulate str)
    str))

(defmethod convert-simple-parse-tree ((parse-tree character))
  (declare #.*standard-optimize-settings*)
  ;; dispatch to the method for strings
  (convert-simple-parse-tree (string parse-tree)))
        
(defmethod convert-simple-parse-tree (parse-tree)
  "The default method - check if there's a translation."
  (declare #.*standard-optimize-settings*)
  (let ((translation (and (symbolp parse-tree) (parse-tree-synonym parse-tree))))
    (if translation
      (convert-aux (copy-tree translation))
      (signal-syntax-error "Unknown token ~A in parse tree." parse-tree))))

(defun convert (parse-tree)
Hans Huebner's avatar
Hans Huebner committed
  "Converts the parse tree PARSE-TREE into an equivalent REGEX object
and returns three values: the REGEX object, the number of registers
seen and an object the regex starts with which is either a STR object
Edi Weitz's avatar
Edi Weitz committed
or an EVERYTHING object \(if the regex starts with something like
Hans Huebner's avatar
Hans Huebner committed
\".*\") or NIL."
Edi Weitz's avatar
Edi Weitz committed
  (declare #.*standard-optimize-settings*)
Hans Huebner's avatar
Hans Huebner committed
  ;; this function basically just initializes the special variables
  ;; and then calls CONVERT-AUX to do all the work
  (let* ((flags (list nil nil nil))
         (reg-num 0)
         reg-names
         named-reg-seen
Hans Huebner's avatar
Hans Huebner committed
         (accumulate-start-p t)
         starts-with
         (max-back-ref 0)
         (converted-parse-tree (convert-aux parse-tree)))
    (declare (special flags reg-num reg-names named-reg-seen
                      accumulate-start-p starts-with max-back-ref))
Hans Huebner's avatar
Hans Huebner committed
    ;; make sure we don't reference registers which aren't there
    (when (> (the fixnum max-back-ref)
             (the fixnum reg-num))
Edi Weitz's avatar
Edi Weitz committed
      (signal-syntax-error "Backreference to register ~A which has not been defined." max-back-ref))
Hans Huebner's avatar
Hans Huebner committed
    (when (typep starts-with 'str)
      (setf (slot-value starts-with 'str)
Edi Weitz's avatar
Edi Weitz committed
              (coerce (slot-value starts-with 'str)
                      #+:lispworks 'lw:simple-text-string
                      #-:lispworks 'simple-string)))
    (values converted-parse-tree reg-num starts-with
            ;; we can't simply use *ALLOW-NAMED-REGISTERS*
            ;; since parse-tree syntax ignores it
            (when named-reg-seen
              (nreverse reg-names)))))