;; -*- lisp -*-
(in-package :parse-html)
(defparameter *whitelist*
(mapcar (lambda (tag-spec)
(list* (first tag-spec)
(cons :class 'sanitize-class)
(cons :id 'sanitize-class)
(rest tag-spec)))
'((:a (:href . sanitize-href))
(:blockquote)
(:dl) (:dt) (:dd)
(:h1) (:h2) (:h3) (:h4) (:h5) (:h6)
(:p)
(:ul) (:ol) (:li)
(:table (:width) (:height) (:border) (:cellpadding) (:cellspacing))
(:thead (:align . sanitize-align) (:valign . sanitize-valign))
(:tfoot (:align . sanitize-align) (:valign . sanitize-valign))
(:tbody (:align . sanitize-align) (:valign . sanitize-valign))
(:tr (:align . sanitize-align) (:valign . sanitize-valign))
(:th (:align . sanitize-align) (:width) (:valign sanitize-valign)
(:colspan) (:rowspan))
(:td (:align . sanitize-align) (:width) (:valign . sanitize-valign)
(:colspan) (:rowspan))
(:pre)
(:div)
(:hr)
(:img (:src . sanitize-href) (:width) (:height) (:alt))
(:span)
(:b)
(:i)
(:u)
(:strong)
(:em)
(:code)
(:br)))
"List of tags and attributes allowed in HTML code. If a tag is
lot listed here it is escaped, if an attribute is not listed
here it is dropped.")
(defparameter *a-zA-Z0-9_*
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_ -")
(defun sanitize-href (href)
"Strips HREF of any javascript.
Returns either a sanitized string or NIL if HREF is
unsanatizable."
(macrolet ((starts-with (prefix)
`(and (< ,(length prefix) (length href))
,@(loop
for prefix-char across prefix
for index upfrom 0
collect `(char= ,prefix-char (aref href ,index))))))
(if (or (starts-with "http:/")
(starts-with "https:/")
(starts-with "ftp:/")
(starts-with "mailto:"))
href
nil)))
(defun sanitize-class (class)
(loop
for char across class
unless (position char *a-zA-Z0-9_*)
do (return-from sanitize-class nil))
class)
(defun sanitize-align (align)
(if (or (string= "left" align)
(string= "center" align)
(string= "justify" align)
(string= "middle" align)
(string= "right" align))
align
nil))
(defun sanitize-valign (valign)
(if (or (string= "top" valign)
(string= "center" valign)
(string= "middle" valign)
(string= "bottom" valign))
valign
nil))
(defun sanitize-attributes (tag-name attributes whitelist)
(let ((allowed-attributes (cdr (assoc tag-name whitelist))))
(loop
for (key value) on attributes by #'cddr
if (assoc key allowed-attributes)
nconc (list key (let ((attribute-sanitizer (assoc key allowed-attributes)))
(or (funcall (or (cdr attribute-sanitizer) #'identity)
value)
""))))))
(defun sanitize (html &key (whitelist *whitelist*) (stream nil))
(let ((html (parse-html html :parse-entities nil)))
(labels ((tag-callback (tag-name attributes body)
(if (assoc tag-name whitelist)
;; tag is allowed
(cons (cons tag-name (sanitize-attributes tag-name attributes whitelist))
(remove-if #'null
(mapcar (lambda (tag)
(walk-lhtml tag #'tag-callback #'string-callback))
body)))
;; tag is not allowed
'()))
(string-callback (string) string))
(let ((sanitized (loop
for tag in html
for walked = (walk-lhtml tag #'tag-callback #'string-callback)
when walked collect it)))
(if stream
(dolist (lhtml sanitized)
(write-lhtml lhtml stream))
(with-output-to-string (stream)
(dolist (lhtml sanitized)
(write-lhtml lhtml stream))))))))