/[climacs]/climacs/html-syntax.lisp
ViewVC logotype

Contents of /climacs/html-syntax.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (show annotations)
Wed Feb 2 08:01:30 2005 UTC (9 years, 2 months ago) by rstrandh
Branch: MAIN
Implemented an embryonic html-syntax module that uses the incremental
Earley parser.

Implemented a command (backward-to-error) to illustrate how
the module can be used.  I am not happy with my syntax yet, though.
It reports too many errors.
1 ;;; -*- Mode: Lisp; Package: COMMON-LISP-USER -*-
2
3 ;;; (c) copyright 2005 by
4 ;;; Robert Strandh (strandh@labri.fr)
5
6 ;;; This library is free software; you can redistribute it and/or
7 ;;; modify it under the terms of the GNU Library General Public
8 ;;; License as published by the Free Software Foundation; either
9 ;;; version 2 of the License, or (at your option) any later version.
10 ;;;
11 ;;; This library is distributed in the hope that it will be useful,
12 ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 ;;; Library General Public License for more details.
15 ;;;
16 ;;; You should have received a copy of the GNU Library General Public
17 ;;; License along with this library; if not, write to the
18 ;;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 ;;; Boston, MA 02111-1307 USA.
20
21 ;;; Syntax for analysing HTML
22
23 (in-package :climacs-syntax) ;;; Put this in a separate package once it works
24
25 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
26 ;;;
27 ;;; grammar classes
28
29 (defclass html-sym ()
30 ((badness :initform 0 :initarg :badness :reader badness)
31 (message :initform "" :initarg :message :reader message)))
32
33 (defmethod parse-tree-better ((t1 html-sym) (t2 html-sym))
34 (and (eq (class-of t1) (class-of t2))
35 (< (badness t1) (badness t2))))
36
37 (defclass html (html-sym) ())
38 (defclass head (html-sym) ())
39 (defclass title (html-sym) ())
40 (defclass body (html-sym) ())
41 (defclass h1 (html-sym) ())
42 (defclass h2 (html-sym) ())
43 (defclass h3 (html-sym) ())
44 (defclass para (html-sym) ())
45 (defclass ul (html-sym) ())
46 (defclass li (html-sym) ())
47 (defclass texts (html-sym) ())
48
49 (defclass error-token (html-sym) ())
50 (defclass text (html-sym) ())
51
52 (defclass <html> (html-sym) ())
53 (defclass </html> (html-sym) ())
54 (defclass <head> (html-sym) ())
55 (defclass </head> (html-sym) ())
56 (defclass <title> (html-sym) ())
57 (defclass </title> (html-sym) ())
58 (defclass <body> (html-sym) ())
59 (defclass </body> (html-sym) ())
60 (defclass <h1> (html-sym) ())
61 (defclass </h1> (html-sym) ())
62 (defclass <h2> (html-sym) ())
63 (defclass </h2> (html-sym) ())
64 (defclass <h3> (html-sym) ())
65 (defclass </h3> (html-sym) ())
66 (defclass <p> (html-sym) ())
67 (defclass </p> (html-sym) ())
68 (defclass <ul> (html-sym) ())
69 (defclass </ul> (html-sym) ())
70 (defclass <li> (html-sym) ())
71 (defclass </li> (html-sym) ())
72
73 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
74 ;;;
75 ;;; lexer
76
77 (defparameter *token-table*
78 '(("<html>" . <html>)
79 ("</html>" . </html>)
80 ("<head>" . <head>)
81 ("</head>" . </head>)
82 ("<title>" . <title>)
83 ("</title>" . </title>)
84 ("<body>" . <body>)
85 ("</body>" . </body>)
86 ("<h1>" . <h1>)
87 ("</h1>" . </h1>)
88 ("<h2>" . <h2>)
89 ("</h2>" . </h2>)
90 ("<h3>" . <h3>)
91 ("</h3>" . </h3>)
92 ("<p>" . <p>)
93 ("</p>" . </p>)
94 ("<ul>" . <ul>)
95 ("</ul>" . </ul>)
96 ("<li>" . <li>)
97 ("</li>" . </li>)))
98
99 (defclass html-lexer (lexer)
100 ((mark :initarg :mark)))
101
102 (defmethod lex ((lexer html-lexer))
103 (with-slots (mark) lexer
104 (assert (not (end-of-buffer-p mark)))
105 (cond ((or (end-of-line-p mark)
106 (not (eql (object-after mark) #\<)))
107 (when (end-of-line-p mark)
108 (forward-object mark))
109 (loop until (or (end-of-line-p mark)
110 (eql (object-after mark) #\<))
111 do (forward-object mark))
112 (make-instance 'text))
113 (t
114 (let ((offset (offset mark)))
115 (forward-object mark)
116 (loop until (or (end-of-line-p mark)
117 (whitespacep (object-after mark))
118 (eql (object-before mark) #\>))
119 do (forward-object mark))
120 (let* ((word (region-to-sequence offset mark))
121 (class-name (cdr (assoc word *token-table* :test #'equalp))))
122 (make-instance (or class-name 'error-token))))))))
123
124 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
125 ;;;
126 ;;; parser
127
128 (defparameter *html-grammar*
129 (grammar
130 (html -> (<html> head body </html>))
131 (<html> -> (html-sym) :badness 5 :message "substituted <html>")
132 (</html> -> (html-sym) :badness 5 :message "substituted </html>")
133 (<html> -> () :badness 10 :message "missing <html> inserted")
134 (</html> -> () :badness 10 :message "missing </html> inserted")
135 (head -> (<head> title </head>))
136 (<head> -> (html-sym) :badness 5 :message "substituted <head>")
137 (</head> -> (html-sym) :badness 5 :message "substituted </head>")
138 (<head> -> () :badness 10 :message "missing <head> inserted")
139 (</head> -> () :badness 10 :message "missing </head> inserted")
140 (title -> (<title> texts </title>))
141 (<title> -> (html-sym) :badness 5 :message "substituted <title>")
142 (</title> -> (html-sym) :badness 5 :message "substituted </title>")
143 (<title> -> () :badness 10 :message "missing <title> inserted")
144 (</title> -> () :badness 10 :message "missing </title> inserted")
145 (body -> (<body> texts </body>))
146 (<body> -> (html-sym) :badness 5 :message "substituted <body>")
147 (</body> -> (html-sym) :badness 5 :message "substituted </body>")
148 (<body> -> () :badness 10 :message "missing <body> inserted")
149 (</body> -> () :badness 10 :message "missing </body> inserted")
150 (texts -> ())
151 (texts -> (texts text))))
152
153 (define-syntax html-syntax ("HTML" (basic-syntax))
154 ((parser)
155 (states)))
156
157 (defmethod initialize-instance :after ((syntax html-syntax) &rest args)
158 (declare (ignore args))
159 (with-slots (parser states buffer) syntax
160 (setf parser (make-instance 'parser
161 :grammar *html-grammar*
162 :lexer (make-instance 'html-lexer
163 :mark (make-instance 'standard-left-sticky-mark :buffer buffer))
164 :target 'html))
165 (setf states (list (cons (make-instance 'standard-left-sticky-mark :buffer buffer)
166 (initial-state parser))))))
167
168 (defmethod update-syntax (buffer (syntax html-syntax))
169 (let ((low-mark (low-mark buffer)))
170 (with-slots (parser states) syntax
171 (with-slots (lexer) parser
172 (with-slots (mark) lexer
173 (loop until (or (null (cdr states))
174 (< (offset (caar states)) (offset low-mark)))
175 do (pop states))
176 (setf (offset mark) (offset (caar states)))
177 (loop until (end-of-buffer-p mark)
178 do (let ((token (lex lexer)))
179 (push (cons (clone-mark mark)
180 (advance-parse parser (list token) (cdar states)))
181 states))))))))
182
183 (defgeneric forward-to-error (point syntax))
184 (defgeneric backward-to-error (point syntax))
185
186 (defun find-bad-parse-tree (state)
187 (maphash (lambda (key parse-trees)
188 (declare (ignore key))
189 (let ((parse-tree (find-if (lambda (parse-tree)
190 (plusp (badness parse-tree)))
191 parse-trees)))
192 (when parse-tree
193 (return-from find-bad-parse-tree parse-tree))))
194 (parse-trees state)))
195
196 (defmethod backward-to-error (point (syntax html-syntax))
197 (let ((states (slot-value syntax 'states)))
198 (loop until (or (null states)
199 (mark< (caar states) point))
200 do (pop states))
201 (loop for (mark . state) in states
202 for tree = (find-bad-parse-tree state)
203 when tree
204 do (setf (offset point) (offset mark))
205 (return (message tree))
206 finally (return "no more errors"))))

  ViewVC Help
Powered by ViewVC 1.1.5