CL-PPCRE - Portable Perl-compatible regular expressions for Common Lisp

(string string)`&key` case-insensitive-mode multi-line-mode single-line-mode extended-mode destructive · regex

(asdf:oos 'asdf:test-op :cl-ppcre)

&key
&key
&key
&optional
* (parse-string "(ab)*")
(:GREEDY-REPETITION 0 NIL (:REGISTER "ab"))

* (parse-string "(a(b))")
(:REGISTER (:SEQUENCE #\a (:REGISTER #\b)))

* (parse-string "(?:abc){3,5}")
(:GREEDY-REPETITION 3 5 (:GROUP "abc"))
;; (:GREEDY-REPETITION 3 5 "abc") would also be OK

* (parse-string "a(?i)b(?-i)c")
(:SEQUENCE #\a
 (:SEQUENCE (:FLAGS :CASE-INSENSITIVE-P)
  (:SEQUENCE #\b (:SEQUENCE (:FLAGS :CASE-SENSITIVE-P) #\c))))
;; same as (:SEQUENCE #\a :CASE-INSENSITIVE-P #\b :CASE-SENSITIVE-P #\c)

* (parse-string "(?=a)b")
(:SEQUENCE (:POSITIVE-LOOKAHEAD #\a) #\b)

&key
* (scan "(a)*b" "xaaabd")
1
5
#(3)
#(4)

* (scan "(a)*b" "xaaabd" :start 1)
1
5
#(3)
#(4)

* (scan "(a)*b" "xaaabd" :start 2)
2
5
#(3)
#(4)

* (scan "(a)*b" "xaaabd" :end 4)
NIL

* (scan '(:greedy-repetition 0 nil #\b) "bbbc")
0
3
#()
#()

* (scan '(:greedy-repetition 4 6 #\b) "bbbc")
NIL

* (let ((s (create-scanner "(([a-c])+)x")))
    (scan s "abcxy"))
0
4
#(0 2)
#(3 3)

&key
* (scan-to-strings "[^b]*b" "aaabd")
"aaab"
#()

* (scan-to-strings "([^b])*b" "aaabd")
"aaab"
#("a")

* (scan-to-strings "(([^b])*)b" "aaabd")
"aaab"
#("aaa" "a")

&key
* (register-groups-bind (first second third fourth)
      ("((a)|(b)|(c))+" "abababc" :sharedp t)
    (list first second third fourth))
("c" "a" "b" "c")

* (register-groups-bind (nil second third fourth)
      ;; note that we don't bind the first and fifth register group
      ("((a)|(b)|(c))()+" "abababc" :start 6)
    (list second third fourth))
(NIL NIL "c")

* (register-groups-bind (first)
      ("(a|b)+" "accc" :start 1)
    (format t "This will not be printed: ~A" first))
NIL

* (register-groups-bind (fname lname (#'parse-integer date month year))
      ("(\\w+)\\s+(\\w+)\\s+(\\d{1,2})\\.(\\d{1,2})\\.(\\d{4})" "Frank Zappa 21.12.1940")
    (list fname lname (encode-universal-time 0 0 0 date month year 0)))
("Frank" "Zappa" 1292889600)

&optional
&key
&optional
&key
* (defun foo (regex target-string &key (start 0) (end (length target-string)))
    (let ((sum 0))
      (do-matches (s e regex target-string nil :start start :end end)
        (incf sum (- e s)))
      (format t "~,2F% of the string was inside of a match~%"
                ;; note: doesn't check for division by zero
                (float (* 100 (/ sum (- end start)))))))

FOO

* (foo "a" "abcabcabc")
33.33% of the string was inside of a match
NIL
* (foo "aa|b" "aacabcbbc")
55.56% of the string was inside of a match
NIL

&optional
&key
* (defun crossfoot (target-string &key (start 0) (end (length target-string)))
    (let ((sum 0))
      (do-matches-as-strings (m :digit-class
                                         target-string nil
                                         :start start :end end)
        (incf sum (parse-integer m)))
      (if (< sum 10)
        sum
        (crossfoot (format nil "~A" sum)))))

CROSSFOOT

* (crossfoot "bar")
0

* (crossfoot "a3x")
3

* (crossfoot "12345")
6

&optional
&key
* (do-register-groups (first second third fourth)
      ("((a)|(b)|(c))" "abababc" nil :start 2 :sharedp t)
    (print (list first second third fourth)))
("a" "a" NIL NIL) 
("b" NIL "b" NIL) 
("a" "a" NIL NIL) 
("b" NIL "b" NIL) 
("c" NIL NIL "c")
NIL

* (let (result)
    (do-register-groups ((#'parse-integer n) (#'intern sign) whitespace)
        ("(\\d+)|(\\+|-|\\*|/)|(\\s+)" "12*15 - 42/3")
      (unless whitespace
        (push (or n sign) result)))
    (nreverse result))
(12 * 15 - 42 / 3)

&key
* (all-matches "a" "foo bar baz")
(5 6 9 10)

* (all-matches "\\w*" "foo bar baz")
(0 3 3 3 4 7 7 7 8 11 11 11)

&key
* (all-matches-as-strings "a" "foo bar baz")
("a" "a")

* (all-matches-as-strings "\\w*" "foo bar baz")
("foo" "" "bar" "" "baz" "")

&key
* (split "\\s+" "foo   bar baz
frob")
("foo" "bar" "baz" "frob")

* (split "\\s*" "foo bar   baz")
("f" "o" "o" "b" "a" "r" "b" "a" "z")

* (split "(\\s+)" "foo bar   baz")
("foo" "bar" "baz")

* (split "(\\s+)" "foo bar   baz" :with-registers-p t)
("foo" " " "bar" "   " "baz")

* (split "(\\s)(\\s*)" "foo bar   baz" :with-registers-p t)
("foo" " " "" "bar" " " "  " "baz")

* (split "(,)|(;)" "foo,bar;baz" :with-registers-p t)
("foo" "," NIL "bar" NIL ";" "baz")

* (split "(,)|(;)" "foo,bar;baz" :with-registers-p t :omit-unmatched-p t)
("foo" "," "bar" ";" "baz")

* (split ":" "a:b:c:d:e:f:g::")
("a" "b" "c" "d" "e" "f" "g")

* (split ":" "a:b:c:d:e:f:g::" :limit 1)
("a:b:c:d:e:f:g::")

* (split ":" "a:b:c:d:e:f:g::" :limit 2)
("a" "b:c:d:e:f:g::")

* (split ":" "a:b:c:d:e:f:g::" :limit 3)
("a" "b" "c:d:e:f:g::")

* (split ":" "a:b:c:d:e:f:g::" :limit 1000)
("a" "b" "c" "d" "e" "f" "g" "" "")

&key
* (regex-replace "fo+" "foo bar" "frob")
"frob bar"
T

* (regex-replace "fo+" "FOO bar" "frob")
"FOO bar"
NIL

* (regex-replace "(?i)fo+" "FOO bar" "frob")
"frob bar"
T

* (regex-replace "(?i)fo+" "FOO bar" "frob" :preserve-case t)
"FROB bar"
T

* (regex-replace "(?i)fo+" "Foo bar" "frob" :preserve-case t)
"Frob bar"
T

* (regex-replace "bar" "foo bar baz" "[frob (was '\\&' between '\\`' and '\\'')]")
"foo [frob (was 'bar' between 'foo ' and ' baz')] baz"
T

* (regex-replace "bar" "foo bar baz"
                          '("[frob (was '" :match "' between '" :before-match "' and '" :after-match "')]"))
"foo [frob (was 'bar' between 'foo ' and ' baz')] baz"
T

* (regex-replace "(be)(nev)(o)(lent)"
                          "benevolent: adj. generous, kind"
                          #'(lambda (match &rest registers)
                              (format nil "~A [~{~A~^.~}]" match registers))
                          :simple-calls t)
"benevolent [be.nev.o.lent]: adj. generous, kind"
T

&key
* (regex-replace-all "(?i)fo+" "foo Fooo FOOOO bar" "frob" :preserve-case t)
"frob Frob FROB bar"
T

* (regex-replace-all "(?i)f(o+)" "foo Fooo FOOOO bar" "fr\\1b" :preserve-case t)
"froob Frooob FROOOOB bar"
T

* (let ((qp-regex (create-scanner "[\\x80-\\xff]")))
    (defun encode-quoted-printable (string)
      "Converts 8-bit string to quoted-printable representation."
      ;; won't work for Corman Lisp because non-ASCII characters aren't 8-bit there
      (flet ((convert (target-string start end match-start match-end reg-starts reg-ends)
             (declare (ignore start end match-end reg-starts reg-ends))
             (format nil "=~2,'0x" (char-code (char target-string match-start)))))
        (regex-replace-all qp-regex string #'convert))))
Converted ENCODE-QUOTED-PRINTABLE.
ENCODE-QUOTED-PRINTABLE

* (encode-quoted-printable "Fête Sørensen naïve Hühner Straße")
"F=EAte S=F8rensen na=EFve H=FChner Stra=DFe"
T

* (let ((url-regex (create-scanner "[^a-zA-Z0-9_\\-.]")))
    (defun url-encode (string)
      "URL-encodes a string."
      ;; won't work for Corman Lisp because non-ASCII characters aren't 8-bit there
      (flet ((convert (target-string start end match-start match-end reg-starts reg-ends)
             (declare (ignore start end match-end reg-starts reg-ends))
             (format nil "%~2,'0x" (char-code (char target-string match-start)))))
        (regex-replace-all url-regex string #'convert))))
Converted URL-ENCODE.
URL-ENCODE

* (url-encode "Fête Sørensen naïve Hühner Straße")
"F%EAte%20S%F8rensen%20na%EFve%20H%FChner%20Stra%DFe"
T

* (defun how-many (target-string start end match-start match-end reg-starts reg-ends)
    (declare (ignore target-string start end match-start match-end))
    (format nil "~A" (- (svref reg-ends 0)
                        (svref reg-starts 0))))
HOW-MANY

* (regex-replace-all "{(.+?)}"
                              "foo{...}bar{.....}{..}baz{....}frob"
                              (list "[" 'how-many " dots]"))
"foo[3 dots]bar[5 dots][2 dots]baz[4 dots]frob"
T

* (let ((qp-regex (create-scanner "[\\x80-\\xff]")))
    (defun encode-quoted-printable (string)
      "Converts 8-bit string to quoted-printable representation.
Version using SIMPLE-CALLS keyword argument."
      ;; won't work for Corman Lisp because non-ASCII characters aren't 8-bit there
      (flet ((convert (match)
               (format nil "=~2,'0x" (char-code (char match 0)))))
        (regex-replace-all qp-regex string #'convert
                                    :simple-calls t))))

Converted ENCODE-QUOTED-PRINTABLE.
ENCODE-QUOTED-PRINTABLE

* (encode-quoted-printable "Fête Sørensen naïve Hühner Straße")
"F=EAte S=F8rensen na=EFve H=FChner Stra=DFe"
T

* (defun how-many (match first-register)
    (declare (ignore match))
    (format nil "~A" (length first-register)))
HOW-MANY

* (regex-replace-all "{(.+?)}"
                              "foo{...}bar{.....}{..}baz{....}frob"
                              (list "[" 'how-many " dots]")
                              :simple-calls t)

"foo[3 dots]bar[5 dots][2 dots]baz[4 dots]frob"
T

* (labels ((char-code-odd-p (char)
             (oddp (char-code char)))
           (char-code-even-p (char)
             (evenp (char-code char)))
           (resolver (name)
             (cond ((string= name "odd") #'char-code-odd-p)
                   ((string= name "even") #'char-code-even-p)
                   ((string= name "true") (constantly t))
                   (t (error "Can't resolve ~S." name)))))
    (let ((*property-resolver* #'resolver))
      ;; quiz question - why do we need CREATE-SCANNER here?
      (list (regex-replace-all (create-scanner "\\p{odd}") "abcd" "+")
            (regex-replace-all (create-scanner "\\p{even}") "abcd" "+")
            (regex-replace-all (create-scanner "\\p{true}") "abcd" "+"))))
("+b+d" "a+c+" "++++")

(setf (parse-tree-synonym symbol) new-parse-tree)
* (parse-string "a*b+")
(:SEQUENCE (:GREEDY-REPETITION 0 NIL #\a) (:GREEDY-REPETITION 1 NIL #\b))

* (defun my-repetition (char min)
    `(:greedy-repetition ,min nil ,char))
MY-REPETITION

* (setf (parse-tree-synonym 'a*) (my-repetition #\a 0))
(:GREEDY-REPETITION 0 NIL #\a)

* (setf (parse-tree-synonym 'b+) (my-repetition #\b 1))
(:GREEDY-REPETITION 1 NIL #\b)

* (let ((scanner (create-scanner '(:sequence a* b+))))
    (dolist (string '("ab" "b" "aab" "a" "x"))
      (print (scan scanner string)))
    (values))
0
0
0
NIL
NIL

* (parse-tree-synonym 'a*)
(:GREEDY-REPETITION 0 NIL #\a)

* (parse-tree-synonym 'a+)
NIL

(defmacro define-parse-tree-synonym (name parse-tree)
  "Define NAME as a synonym for PARSE-TREE; neither argument is evaluated.
The expansion stores the quoted PARSE-TREE under the quoted NAME with
SETF of PARSE-TREE-SYNONYM, wrapped in EVAL-WHEN so that the assignment
happens at compile time, at load time and on plain evaluation."
  (let ((place (list 'parse-tree-synonym (list 'quote name)))
        (value (list 'quote parse-tree)))
    (list 'eval-when
          '(:compile-toplevel :load-toplevel :execute)
          (list 'setf place value))))

;; A-Z names a character class covering the ranges a-z and A-Z.
(define-parse-tree-synonym a-z
  (:char-class (:range #\a #\z) (:range #\A #\Z)))

;; A-Z* is zero or more greedy repetitions of the A-Z synonym defined just
;; above; a synonym's parse tree may itself mention another synonym.
(define-parse-tree-synonym a-z*
  (:greedy-repetition 0 nil a-z))

(defun ascii-char-tester (string)
  ;; Scans STRING with a parse tree that is anchored at both ends and
  ;; allows only the symbol A-Z* in between; A-Z* is a parse tree synonym
  ;; here, not a literal string.  Returns whatever SCAN returns.
  (scan '(:sequence :start-anchor a-z* :end-anchor)
        string))

* (scan "^a+$" "a+")
NIL

* (let ((*allow-quoting* t))
    ;; we use CREATE-SCANNER because of Lisps like SBCL that don't have an interpreter
    (scan (create-scanner "^\\Qa+\\E$") "a+"))
0
2
#()
#()

* (let ((*allow-quoting* t))
    (scan (create-scanner "\\Qa()\\E(?#comment\\Q)a**b") "()ab"))

Quantifier '*' not allowed at position 19 in string "a\\(\\)(?#commentQ)a**b"

* (scan '(:sequence :start-anchor
                    "a+" ;; no quoting necessary
                    :end-anchor)
        "a+")
0
2
#()
#()

;; Perl compatible mode (*ALLOW-NAMED-REGISTERS* is NIL)
* (create-scanner "(?<reg>.*)")
Character 'r' may not follow '(?<' at position 3 in string "(?<reg>.*)"

;; just unescapes "\\k"
* (parse-string "\\k<reg>")
"k<reg>"

* (setq *allow-named-registers* t)
T

* (create-scanner "((?<small>[a-z]*)(?<big>[A-Z]*))")
#<CLOSURE (LAMBDA (STRING CL-PPCRE::START CL-PPCRE::END)) {AD75BFD}>
(NIL "small" "big")

;; the scanner doesn't capture any information about named groups -
;; you have to store the second value returned from CREATE-SCANNER yourself
* (scan * "aaaBBB")
0
6
#(0 0 3)
#(6 3 6)

;; parse tree syntax
* (parse-string "((?<small>[a-z]*)(?<big>[A-Z]*))")
(:REGISTER
 (:SEQUENCE
  (:NAMED-REGISTER "small"
   (:GREEDY-REPETITION 0 NIL (:CHAR-CLASS (:RANGE #\a #\z))))
  (:NAMED-REGISTER "big"
   (:GREEDY-REPETITION 0 NIL (:CHAR-CLASS (:RANGE #\A #\Z))))))

* (create-scanner *)
#<CLOSURE (LAMBDA (STRING CL-PPCRE::START CL-PPCRE::END)) {B158E3D}>
(NIL "small" "big")

;; multiple-choice back-reference
* (scan "^(?<reg>[ab])(?<reg>[12])\\k<reg>\\k<reg>$" "a1aa")
0
4
#(0 1)
#(1 2)

* (scan "^(?<reg>[ab])(?<reg>[12])\\k<reg>\\k<reg>$" "a22a")
0
4
#(0 1)
#(1 2)

;; demonstrating most-recently-seen-register-first property of back-reference;
;; "greedy" regex (analogous to "aa?")
* (scan "^(?<reg>)(?<reg>a)(\\k<reg>)" "a")
0
1
#(0 0 1)
#(0 1 1)

* (scan "^(?<reg>)(?<reg>a)(\\k<reg>)" "aa")
0
2
#(0 0 1)
#(0 1 2)

;; switched groups
;; "lazy" regex (analogous to "aa??")
* (scan "^(?<reg>a)(?<reg>)(\\k<reg>)" "a")
0
1
#(0 1 1)
#(1 1 1)

;; scanner ignores the second "a"
* (scan "^(?<reg>a)(?<reg>)(\\k<reg>)" "aa")
0
1
#(0 1 1)
#(1 1 1)

;; "aa" will be matched only when forced by adding "$" at the end
* (scan "^(?<reg>a)(?<reg>)(\\k<reg>)$" "aa")
0
2
#(0 1 1)
#(1 1 2)

&key
* (quote-meta-chars "[a-z]*")
"\\[a\\-z\\]\\*"

&optional
&key
* *package*
#<The COMMON-LISP-USER package, 16/21 internal, 0/9 external>

* (defun foo (n &optional (k 0)) (+ 3 n k))
FOO

* (defparameter foo "bar")
FOO

* (defparameter |foobar| 42)
|foobar|

* (defparameter fooboo 43)
FOOBOO

* (defclass frobar () ())
#<STANDARD-CLASS FROBAR {4874E625}>

* (regex-apropos "foo(?:bar)?")
FOO [variable] value: "bar"
    [compiled function] (N &OPTIONAL (K 0))
FOOBOO [variable] value: 43
|foobar| [variable] value: 42

* (regex-apropos "(?:foo|fro)bar")
PCL::|COMMON-LISP-USER::FROBAR class predicate| [compiled closure]
FROBAR [class] #<STANDARD-CLASS FROBAR {4874E625}>
|foobar| [variable] value: 42

* (regex-apropos "(?:foo|fro)bar" 'cl-user)
FROBAR [class] #<STANDARD-CLASS FROBAR {4874E625}>
|foobar| [variable] value: 42

* (regex-apropos "(?:foo|fro)bar" '(pcl ext))
PCL::|COMMON-LISP-USER::FROBAR class predicate| [compiled closure]

* (regex-apropos "foo")
FOO [variable] value: "bar"
    [compiled function] (N &OPTIONAL (K 0))
FOOBOO [variable] value: 43
|foobar| [variable] value: 42

* (regex-apropos "foo" nil :case-insensitive nil)
|foobar| [variable] value: 42

&optional
&key
* (regex-apropos-list "foo(?:bar)?")
(|foobar| FOOBOO FOO)

* (handler-case
    (scan "foo**x" "fooox")
    (ppcre-syntax-error (condition)
      (format t "Houston, we've got a problem with the string ~S:~%~
                 Looks like something went wrong at position ~A.~%~
                 The last message we received was \"~?\"."
              (ppcre-syntax-error-string condition)
              (ppcre-syntax-error-pos condition)
              (simple-condition-format-control condition)
              (simple-condition-format-arguments condition))
      (values)))
Houston, we've got a problem with the string "foo**x":
Looks like something went wrong at position 4.
The last message we received was "Quantifier '*' not allowed.".

(asdf:oos 'asdf:load-op :cl-ppcre-unicode)

* (scan-to-strings "\\p{Script:Latin}+" "0+AB_*")
"AB"
#()

* (defun my-info-filter (pos)
    "Show some info about the matching process."
    (format t "Called at position ~A~%" pos)
    (loop with dim = (array-dimension cl-ppcre::*reg-starts* 0)
          for i below dim
          for reg-start = (aref cl-ppcre::*reg-starts* i)
          for reg-end = (aref cl-ppcre::*reg-ends* i)
          do (format t "Register ~A is currently " (1+ i))
          when reg-start
            do (write-char #\')
               (write-string cl-ppcre::*string* nil
                             :start reg-start :end reg-end)
               (write-char #\')
          else
            do (write-string "unbound")
          do (terpri))
    (terpri)
    pos)
MY-INFO-FILTER

* (scan '(:sequence
           (:register
             (:greedy-repetition 0 nil
                                 (:char-class (:range #\a #\z))))
           (:filter my-info-filter 0) "X")
        "bYcdeX")
Called at position 1
Register 1 is currently 'b'

Called at position 0
Register 1 is currently ''

Called at position 1
Register 1 is currently ''

Called at position 5
Register 1 is currently 'cde'

2
6
#(2)
#(5)

* (scan '(:sequence
           (:register
             (:greedy-repetition 0 nil
                                 (:char-class (:range #\a #\z))))
           (:filter my-info-filter 0) "X")
        "bYcdeZ")
NIL

* (defun my-weird-filter (pos)
    "Only match at this point if either pos is odd and the character
  we're looking at is lowercase or if pos is even and the next two
  characters we're looking at are uppercase. Consume these characters if
  there's a match."
    (format t "Trying at position ~A~%" pos)
    (cond ((and (oddp pos)
                (< pos cl-ppcre::*end-pos*)
                (lower-case-p (char cl-ppcre::*string* pos)))
           (1+ pos))
          ((and (evenp pos)
                (< (1+ pos) cl-ppcre::*end-pos*)
                (upper-case-p (char cl-ppcre::*string* pos))
                (upper-case-p (char cl-ppcre::*string* (1+ pos))))
           (+ pos 2))
          (t nil)))
MY-WEIRD-FILTER

* (defparameter *weird-regex*
                `(:sequence "+" (:filter ,#'my-weird-filter) "+"))
*WEIRD-REGEX*

* (scan *weird-regex* "+A++a+AA+")
Trying at position 1
Trying at position 3
Trying at position 4
Trying at position 6
5
9
#()
#()

* (fmakunbound 'my-weird-filter)
MY-WEIRD-FILTER

* (scan *weird-regex* "+A++a+AA+")
Trying at position 1
Trying at position 3
Trying at position 4
Trying at position 6
5
9
#()
#()

#!/usr/bin/perl -l

$a = '\E*';
print 1
  if '\E*\E*' =~ /(?:\Q$a\E){2}/;

* (let ((*allow-quoting* t)
        (a "\\E*"))
    (scan (concatenate 'string "(?:\\Q" a "\\E){2}") "\\E*\\E*"))
Quantifier '*' not allowed at position 3 in string "(?:*\\E){2}"

* (let ((a "\\E*"))
    (scan (concatenate 'string "(?:" (quote-meta-chars a) "){2}") "\\E*\\E*"))
0
6
#()
#()

* (let ((a "\\E*"))
    (scan `(:greedy-repetition 2 2 ,a) "\\E*\\E*"))
0
6
#()
#()

* (let ((a "y\\y"))
    (scan a a))
NIL

'y\y' =~ /y\y/;

$a = 'y\y';
$a =~ /$a/;

(push :use-acl-regexp2-engine *features*)

CL-USER 1 > (scan-to-strings "<=|<" "<=")
"<="
#()

CL-USER 2 > (scan-to-strings "<|<=" "<=")
"<"
#()

(defun regex-match (regex target)
  ;; don't do that!
  ;; This wrapper only forwards REGEX and TARGET to SCAN; it is shown as an
  ;; example of what to avoid.
  ;; NOTE(review): presumably discouraged because REGEX is only known at run
  ;; time here, so a string regex must be turned into a scanner on every
  ;; call - confirm against the performance hints.
  (scan regex target))

;; Times two pre-built scanners against the same target: a string of 10000
;; #\a characters.  The target contains no digit, so neither regex can match.
;; SCANNER-1 uses a plain greedy "a*"; SCANNER-2 wraps the same "a*" in "(?>...)".
;; NOTE(review): presumably meant to show that the "(?>...)" variant fails
;; much faster because it does not backtrack into "a*" - confirm.
(let ((target (make-string 10000 :initial-element #\a))
      (scanner-1 (create-scanner "a*\\d"))
      (scanner-2 (create-scanner "(?>a*)\\d")))
  (time (scan scanner-1 target))
  (time (scan scanner-2 target)))

CL-PPCRE - Portable Perl-compatible regular expressions for Common Lisp

Abstract

Contents

Download and installation

Support and mailing lists

The CL-PPCRE dictionary

Scanning

Splitting and replacing

Modifying scanner behaviour

Miscellaneous

Conditions

Unicode properties

Filters

Compatibility with Perl

Empty strings instead of `undef` in `$1`, `$2`, etc.

Strange scoping of embedded modifiers

Inconsistent capturing of `$1`, `$2`, etc.

Captured groups not available outside of look-aheads and look-behinds

Alternations don't always work from left to right

Different names for Unicode properties

`"\r"` doesn't work with MCL

What about `"\w"`?

Bugs and problems

`"\Q"` doesn't work, or does it?

Backslashes may confuse you...

AllegroCL compatibility mode

Hints, comments, performance considerations

Acknowledgements