;; -*- lisp -*- (in-package :parse-html) (defparameter *whitelist* (mapcar (lambda (tag-spec) (list* (first tag-spec) (cons :class 'sanitize-class) (cons :id 'sanitize-class) (rest tag-spec))) '((:a (:href . sanitize-href)) (:blockquote) (:dl) (:dt) (:dd) (:h1) (:h2) (:h3) (:h4) (:h5) (:h6) (:p) (:ul) (:ol) (:li) (:table (:width) (:height) (:border) (:cellpadding) (:cellspacing)) (:thead (:align . sanitize-align) (:valign . sanitize-valign)) (:tfoot (:align . sanitize-align) (:valign . sanitize-valign)) (:tbody (:align . sanitize-align) (:valign . sanitize-valign)) (:tr (:align . sanitize-align) (:valign . sanitize-valign)) (:th (:align . sanitize-align) (:width) (:valign sanitize-valign) (:colspan) (:rowspan)) (:td (:align . sanitize-align) (:width) (:valign . sanitize-valign) (:colspan) (:rowspan)) (:pre) (:div) (:hr) (:img (:src . sanitize-href) (:width) (:height) (:alt)) (:span) (:b) (:i) (:u) (:strong) (:em) (:code) (:br))) "List of tags and attributes allowed in HTML code. If a tag is lot listed here it is escaped, if an attribute is not listed here it is dropped.") (defparameter *a-zA-Z0-9_* "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_ -") (defun sanitize-href (href) "Strips HREF of any javascript. Returns either a sanitized string or NIL if HREF is unsanatizable." (macrolet ((starts-with (prefix) `(and (< ,(length prefix) (length href)) ,@(loop for prefix-char across prefix for index upfrom 0 collect `(char= ,prefix-char (aref href ,index)))))) (if (or (starts-with "http:/") (starts-with "https:/") (starts-with "ftp:/") (starts-with "mailto:")) href nil))) (defun sanitize-class (class) (loop for char across class unless (position char *a-zA-Z0-9_*) do (return-from sanitize-class nil)) class) (defun sanitize-align (align) (if (or (string= "left" align) (string= "center" align) (string= "justify" align) (string= "middle" align) (string= "right" align)) align nil)) (defun sanitize-valign (valign) (if (or (string= "top" valign) (string= "center" valign) (string= "middle" valign) (string= "bottom" valign)) valign nil)) (defun sanitize-attributes (tag-name attributes whitelist) (let ((allowed-attributes (cdr (assoc tag-name whitelist)))) (loop for (key value) on attributes by #'cddr if (assoc key allowed-attributes) nconc (list key (let ((attribute-sanitizer (assoc key allowed-attributes))) (or (funcall (or (cdr attribute-sanitizer) #'identity) value) "")))))) (defun sanitize (html &key (whitelist *whitelist*) (stream nil)) (let ((html (parse-html html :parse-entities nil))) (labels ((tag-callback (tag-name attributes body) (if (assoc tag-name whitelist) ;; tag is allowed (cons (cons tag-name (sanitize-attributes tag-name attributes whitelist)) (remove-if #'null (mapcar (lambda (tag) (walk-lhtml tag #'tag-callback #'string-callback)) body))) ;; tag is not allowed '())) (string-callback (string) string)) (let ((sanitized (loop for tag in html for walked = (walk-lhtml tag #'tag-callback #'string-callback) when walked collect it))) (if stream (dolist (lhtml sanitized) (write-lhtml lhtml stream)) (with-output-to-string (stream) (dolist (lhtml sanitized) (write-lhtml lhtml stream))))))))