/[climacs]/climacs/html-syntax.lisp
ViewVC logotype

Contents of /climacs/html-syntax.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.13 - (show annotations)
Tue Mar 15 04:31:59 2005 UTC (9 years, 1 month ago) by rstrandh
Branch: MAIN
Changes since 1.12: +76 -66 lines
factored out the incremental lexer from html-syntax.  The code is
still physically in the file html-syntax.lisp, but that will change
soon.
1 ;;; -*- Mode: Lisp; Package: COMMON-LISP-USER -*-
2
3 ;;; (c) copyright 2005 by
4 ;;; Robert Strandh (strandh@labri.fr)
5
6 ;;; This library is free software; you can redistribute it and/or
7 ;;; modify it under the terms of the GNU Library General Public
8 ;;; License as published by the Free Software Foundation; either
9 ;;; version 2 of the License, or (at your option) any later version.
10 ;;;
11 ;;; This library is distributed in the hope that it will be useful,
12 ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 ;;; Library General Public License for more details.
15 ;;;
16 ;;; You should have received a copy of the GNU Library General Public
17 ;;; License along with this library; if not, write to the
18 ;;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 ;;; Boston, MA 02111-1307 USA.
20
21 ;;; Syntax for analysing HTML
22
23 (in-package :climacs-html-syntax)
24
25 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
26 ;;;
27 ;;; this should really go in syntax.lisp
28
29 (defclass parse-tree ()
30 ((start-mark :initarg :start-mark :reader start-mark)
31 (size :initarg :size)))
32
33 (defgeneric start-offset (parse-tree))
34
35 (defmethod start-offset ((tree parse-tree))
36 (offset (start-mark tree)))
37
38 (defgeneric end-offset (parse-tree))
39
40 (defmethod end-offset ((tree parse-tree))
41 (with-slots (start-mark size) tree
42 (+ (offset start-mark) size)))
43
44 (defclass lexer ()
45 ((buffer :initarg :buffer :reader buffer)))
46
47 (defgeneric nb-lexemes (lexer))
48 (defgeneric lexeme (lexer pos))
49 (defgeneric insert-lexeme (lexer pos lexeme))
50 (defgeneric delete-invalid-lexemes (lexer from to))
51 (defgeneric inter-lexeme-object-p (lexer object))
52 (defgeneric skip-inter-lexeme-objects (lexer scan))
53 (defgeneric update-lex (lexer start-pos end))
54
55 (defclass incremental-lexer (lexer)
56 ((lexemes :initform (make-instance 'standard-flexichain) :reader lexemes)))
57
58 (defmethod nb-lexemes ((lexer incremental-lexer))
59 (nb-elements (lexemes lexer)))
60
61 (defmethod lexeme ((lexer incremental-lexer) pos)
62 (element* (lexemes lexer) pos))
63
64 (defmethod insert-lexeme ((lexer incremental-lexer) pos lexeme)
65 (insert* (lexemes lexer) pos lexeme))
66
67 (defmethod delete-invalid-lexemes ((lexer incremental-lexer) from to)
68 "delete all lexemes between FROM and TO and return the first invalid
69 position in the lexemes of LEXER"
70 (with-slots (lexemes) lexer
71 (let ((start 1)
72 (end (nb-elements lexemes)))
73 ;; use binary search to find the first lexeme to delete
74 (loop while (< start end)
75 do (let ((middle (floor (+ start end) 2)))
76 (if (mark< (end-offset (element* lexemes middle)) from)
77 (setf start (1+ middle))
78 (setf end middle))))
79 ;; delete lexemes
80 (loop until (or (= start (nb-elements lexemes))
81 (mark> (start-mark (element* lexemes start)) to))
82 do (delete* lexemes start))
83 start)))
84
85 (defmethod skip-inter-lexeme-objects ((lexer incremental-lexer) scan)
86 (loop until (end-of-buffer-p scan)
87 while (inter-lexeme-object-p lexer (object-after scan))
88 do (forward-object scan)))
89
90 (defmethod update-lex ((lexer incremental-lexer) start-pos end)
91 (let ((scan (clone-mark (low-mark (buffer lexer)) :left)))
92 (setf (offset scan)
93 (end-offset (lexeme lexer (1- start-pos))))
94 (loop do (skip-inter-lexeme-objects lexer scan)
95 until (if (end-of-buffer-p end)
96 (end-of-buffer-p scan)
97 (mark> scan end))
98 do (let* ((start-mark (clone-mark scan))
99 (lexeme (next-lexeme scan))
100 (size (- (offset scan) (offset start-mark))))
101 (setf (slot-value lexeme 'start-mark) start-mark
102 (slot-value lexeme 'size) size)
103 (insert-lexeme lexer start-pos lexeme))
104 (incf start-pos))))
105
106 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
107 ;;;
108 ;;; grammar classes
109
110 (defclass html-sym (parse-tree)
111 ((badness :initform 0 :initarg :badness :reader badness)
112 (message :initform "" :initarg :message :reader message)))
113
114 (defmethod parse-tree-better ((t1 html-sym) (t2 html-sym))
115 (and (eq (class-of t1) (class-of t2))
116 (< (badness t1) (badness t2))))
117
118 (defclass html-nonterminal (html-sym) ())
119
120 (defclass words (html-nonterminal) ())
121
122 (defclass empty-words (words) ())
123
124 (defclass nonempty-words (words)
125 ((words :initarg :words)
126 (word :initarg :word)))
127
128 (defclass html-balanced (html-nonterminal)
129 ((start :initarg :start)
130 (end :initarg :end)))
131
132 (defclass html (html-balanced)
133 ((head :initarg :head)
134 (body :initarg :body)))
135
136 (defclass head (html-balanced)
137 ((title :initarg :title)))
138
139 (defclass html-words (html-balanced)
140 ((words :initarg :words)))
141
142 (defclass title (html-words) ())
143 (defclass body (html-words) ())
144 (defclass h1 (html-words) ())
145 (defclass h2 (html-words) ())
146 (defclass h3 (html-words) ())
147 (defclass a (html-words) ())
148 (defclass para (html-words) ())
149
150 (defclass html-token (html-sym)
151 ((ink) (face)))
152
153 (defclass html-tag (html-token) ())
154
155 (defclass <html> (html-tag) () (:default-initargs :size 6))
156 (defclass </html> (html-tag) ()(:default-initargs :size 7))
157 (defclass <head> (html-tag) () (:default-initargs :size 6))
158 (defclass </head> (html-tag) () (:default-initargs :size 7))
159 (defclass <title> (html-tag) () (:default-initargs :size 7))
160 (defclass </title> (html-tag) () (:default-initargs :size 8))
161 (defclass <body> (html-tag) () (:default-initargs :size 6))
162 (defclass </body> (html-tag) () (:default-initargs :size 7))
163 (defclass <h1> (html-tag) () (:default-initargs :size 4))
164 (defclass </h1> (html-tag) () (:default-initargs :size 5))
165 (defclass <h2> (html-tag) () (:default-initargs :size 4))
166 (defclass </h2> (html-tag) () (:default-initargs :size 5))
167 (defclass <h3> (html-tag) () (:default-initargs :size 4))
168 (defclass </h3> (html-tag) () (:default-initargs :size 5))
169 (defclass <p> (html-tag) () (:default-initargs :size 3))
170 (defclass </p> (html-tag) () (:default-initargs :size 4))
171 (defclass <ul> (html-tag) () (:default-initargs :size 4))
172 (defclass </ul> (html-tag) () (:default-initargs :size 5))
173 (defclass <li> (html-tag) () (:default-initargs :size 4))
174 (defclass </li> (html-tag) () (:default-initargs :size 5))
175 (defclass <a> (html-tag)
176 ((start :initarg :start)
177 (word :initarg :word)
178 (words :initarg :words)
179 (end :initarg :end)))
180 (defclass </a> (html-tag) () (:default-initargs :size 4))
181
182
183 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
184 ;;;
185 ;;; lexer
186
187 (defclass html-element (html-token)
188 ((state :initarg :state)))
189
190 (defclass start-element (html-element) ())
191 (defclass tag-start (html-element) ())
192 (defclass tag-end (html-element) ())
193 (defclass slash (html-element) ())
194 (defclass word (html-element) ())
195 (defclass delimiter (html-element) ())
196
197 (defun next-lexeme (scan)
198 (flet ((fo () (forward-object scan)))
199 (let ((object (object-after scan)))
200 (case object
201 (#\< (fo) (make-instance 'tag-start))
202 (#\> (fo) (make-instance 'tag-end))
203 (#\/ (fo) (make-instance 'slash))
204 (t (cond ((alphanumericp object)
205 (loop until (end-of-buffer-p scan)
206 while (alphanumericp (object-after scan))
207 do (fo))
208 (make-instance 'word))
209 (t
210 (fo) (make-instance 'delimiter))))))))
211
212 (defclass html-lexer (incremental-lexer) ())
213
214 (define-syntax html-syntax ("HTML" (basic-syntax))
215 ((lexer :reader lexer)
216 (valid-parse :initform 1)
217 (parser)))
218
219 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
220 ;;;
221 ;;; parser
222
223 (defun word-is (word string)
224 (string-equal (coerce (region-to-sequence (start-mark word) (end-offset word)) 'string)
225 string))
226
227 (defparameter *html-grammar*
228 (grammar
229 (<html> -> (tag-start
230 (word (and (= (end-offset tag-start) (start-offset word))
231 (word-is word "html")))
232 (tag-end (= (end-offset word) (start-offset tag-end))))
233 :start-mark (start-mark tag-start))
234 (</html> -> (tag-start
235 (slash (= (end-offset tag-start) (start-offset slash)))
236 (word (and (= (end-offset slash) (start-offset word))
237 (word-is word "html")))
238 (tag-end (= (end-offset word) (start-offset tag-end))))
239 :start-mark (start-mark tag-start))
240 (<head> -> (tag-start
241 (word (and (= (end-offset tag-start) (start-offset word))
242 (word-is word "head")))
243 (tag-end (= (end-offset word) (start-offset tag-end))))
244 :start-mark (start-mark tag-start))
245 (</head> -> (tag-start
246 (slash (= (end-offset tag-start) (start-offset slash)))
247 (word (and (= (end-offset slash) (start-offset word))
248 (word-is word "head")))
249 (tag-end (= (end-offset word) (start-offset tag-end))))
250 :start-mark (start-mark tag-start))
251 (<title> -> (tag-start
252 (word (and (= (end-offset tag-start) (start-offset word))
253 (word-is word "title")))
254 (tag-end (= (end-offset word) (start-offset tag-end))))
255 :start-mark (start-mark tag-start))
256 (</title> -> (tag-start
257 (slash (= (end-offset tag-start) (start-offset slash)))
258 (word (and (= (end-offset slash) (start-offset word))
259 (word-is word "title")))
260 (tag-end (= (end-offset word) (start-offset tag-end))))
261 :start-mark (start-mark tag-start))
262 (<body> -> (tag-start
263 (word (and (= (end-offset tag-start) (start-offset word))
264 (word-is word "body")))
265 (tag-end (= (end-offset word) (start-offset tag-end))))
266 :start-mark (start-mark tag-start))
267 (</body> -> (tag-start
268 (slash (= (end-offset tag-start) (start-offset slash)))
269 (word (and (= (end-offset slash) (start-offset word))
270 (word-is word "body")))
271 (tag-end (= (end-offset word) (start-offset tag-end))))
272 :start-mark (start-mark tag-start))
273 (<a> -> (tag-start
274 (word (and (= (end-offset tag-start) (start-offset word))
275 (word-is word "a")))
276 words
277 tag-end)
278 :start-mark (start-mark tag-start)
279 :size (- (end-offset tag-end) (start-offset tag-start))
280 :start tag-start :word word :words words :end tag-end)
281 (</a> -> (tag-start
282 (slash (= (end-offset tag-start) (start-offset slash)))
283 (word (and (= (end-offset slash) (start-offset word))
284 (word-is word "a")))
285 (tag-end (= (end-offset word) (start-offset tag-end))))
286 :start-mark (start-mark tag-start))
287 (html -> (<html> head body </html>)
288 :start-mark (start-mark <html>)
289 :size (- (end-offset </html>) (start-offset <html>))
290 :start <html> :head head :body body :end </html>)
291 (head -> (<head> title </head>)
292 :start-mark (start-mark <head>)
293 :size (- (end-offset </head>) (start-offset <head>))
294 :start <head> :title title :end </head>)
295 (title -> (<title> words </title>)
296 :start-mark (start-mark <title>)
297 :size (- (end-offset </title>) (start-offset <title>))
298 :start <title> :words words :end </title>)
299 (body -> (<body> words </body>)
300 :start-mark (start-mark <body>)
301 :size (- (end-offset </body>) (start-offset <body>))
302 :start <body> :words words :end </body>)
303 (a -> (<a> words </a>)
304 :start-mark (start-mark <a>)
305 :size (- (end-offset </a>) (start-offset <a>))
306 :start <a> :words words :end </a>)
307 (words -> ()
308 (make-instance 'empty-words :start-mark nil))
309 (words -> (words word)
310 (make-instance 'nonempty-words
311 :start-mark (or (start-mark words) (start-mark word))
312 :size (- (end-offset word) (offset (or (start-mark words) (start-mark word))))
313 :words words :word word))
314 (word -> (a)
315 :start-mark (start-mark a)
316 :size (- (end-offset a) (start-offset a)))
317 (word -> (delimiter)
318 :start-mark (start-mark delimiter)
319 :size (- (end-offset delimiter) (start-offset delimiter)))))
320
321
322 (defmethod initialize-instance :after ((syntax html-syntax) &rest args)
323 (declare (ignore args))
324 (with-slots (parser lexer buffer) syntax
325 (setf parser (make-instance 'parser
326 :grammar *html-grammar*
327 :target 'html))
328 (setf lexer (make-instance 'html-lexer :buffer (buffer syntax)))
329 (let ((m (clone-mark (low-mark buffer) :left)))
330 (setf (offset m) 0)
331 (insert-lexeme lexer 0 (make-instance 'start-element
332 :start-mark m
333 :size 0
334 :state (initial-state parser))))))
335
336 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
337 ;;;
338 ;;; update syntax
339
340
341 (defmethod update-syntax-for-display (buffer (syntax html-syntax) top bot)
342 (with-slots (parser lexer valid-parse) syntax
343 (loop until (= valid-parse (nb-lexemes lexer))
344 while (mark<= (end-offset (lexeme lexer valid-parse)) bot)
345 do (let ((current-token (lexeme lexer (1- valid-parse)))
346 (next-lexeme (lexeme lexer valid-parse)))
347 (setf (slot-value next-lexeme 'state)
348 (advance-parse parser (list next-lexeme) (slot-value current-token 'state))))
349 (incf valid-parse))))
350
351 (defmethod inter-lexeme-object-p ((lexer html-lexer) object)
352 (whitespacep object))
353
354 (defmethod update-syntax (buffer (syntax html-syntax))
355 (with-slots (lexer valid-parse) syntax
356 (let* ((low-mark (low-mark buffer))
357 (high-mark (high-mark buffer))
358 (first-invalid-position (delete-invalid-lexemes lexer low-mark high-mark)))
359 (setf valid-parse first-invalid-position)
360 (update-lex lexer first-invalid-position high-mark))))
361
362 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
363 ;;;
364 ;;; display
365
366 (defvar *white-space-start* nil)
367
368 (defvar *cursor-positions* nil)
369 (defvar *current-line* 0)
370
371 (defun handle-whitespace (pane buffer start end)
372 (let ((space-width (space-width pane))
373 (tab-width (tab-width pane)))
374 (loop while (< start end)
375 do (ecase (buffer-object buffer start)
376 (#\Newline (terpri pane)
377 (setf (aref *cursor-positions* (incf *current-line*))
378 (multiple-value-bind (x y) (stream-cursor-position pane)
379 (declare (ignore x))
380 y)))
381 (#\Space (stream-increment-cursor-position
382 pane space-width 0))
383 (#\Tab (let ((x (stream-cursor-position pane)))
384 (stream-increment-cursor-position
385 pane (- tab-width (mod x tab-width)) 0))))
386 (incf start))))
387
388 (defmethod display-parse-tree :around ((entity html-sym) syntax pane)
389 (with-slots (top bot) pane
390 (when (mark> (end-offset entity) top)
391 (call-next-method))))
392
393 (defmethod display-parse-tree :around ((entity empty-words) syntax pane)
394 (declare (ignore syntax pane))
395 nil)
396
397 (defmethod display-parse-tree ((entity html-token) (syntax html-syntax) pane)
398 (flet ((cache-test (t1 t2)
399 (and (eq t1 t2)
400 (eq (slot-value t1 'ink)
401 (medium-ink (sheet-medium pane)))
402 (eq (slot-value t1 'face)
403 (text-style-face (medium-text-style (sheet-medium pane)))))))
404 (updating-output (pane :unique-id entity
405 :id-test #'eq
406 :cache-value entity
407 :cache-test #'cache-test)
408 (with-slots (ink face) entity
409 (setf ink (medium-ink (sheet-medium pane))
410 face (text-style-face (medium-text-style (sheet-medium pane))))
411 (present (coerce (region-to-sequence (start-mark entity)
412 (end-offset entity))
413 'string)
414 'string
415 :stream pane)))))
416
417 (defmethod display-parse-tree :around ((entity html-tag) (syntax html-syntax) pane)
418 (with-drawing-options (pane :ink +green+)
419 (call-next-method)))
420
421 (defmethod display-parse-tree :before ((entity html-token) (syntax html-syntax) pane)
422 (handle-whitespace pane (buffer pane) *white-space-start* (start-offset entity))
423 (setf *white-space-start* (end-offset entity)))
424
425 (defmethod display-parse-tree :before ((entity html-balanced) (syntax html-syntax) pane)
426 (with-slots (start) entity
427 (display-parse-tree start syntax pane)))
428
429 (defmethod display-parse-tree :after ((entity html-balanced) (syntax html-syntax) pane)
430 (with-slots (end) entity
431 (display-parse-tree end syntax pane)))
432
433 (defmethod display-parse-tree :around ((entity title) (syntax html-syntax) pane)
434 (with-text-face (pane :bold)
435 (call-next-method)))
436
437 (defmethod display-parse-tree ((entity html-words) (syntax html-syntax) pane)
438 (with-slots (words) entity
439 (display-parse-tree words syntax pane)))
440
441 (defmethod display-parse-tree ((entity empty-words) (syntax html-syntax) pane)
442 (declare (ignore pane))
443 nil)
444
445 (defmethod display-parse-tree ((entity nonempty-words) (syntax html-syntax) pane)
446 (with-slots (words word) entity
447 (display-parse-tree words syntax pane)
448 (display-parse-tree word syntax pane)))
449
450 (defmethod display-parse-tree ((entity html) (syntax html-syntax) pane)
451 (with-slots (head body) entity
452 (display-parse-tree head syntax pane)
453 (display-parse-tree body syntax pane)))
454
455 (defmethod display-parse-tree ((entity head) (syntax html-syntax) pane)
456 (with-slots (title) entity
457 (display-parse-tree title syntax pane)))
458
459 (defmethod display-parse-tree ((entity <a>) (syntax html-syntax) pane)
460 (with-slots (start word words end) entity
461 (display-parse-tree start syntax pane)
462 (display-parse-tree word syntax pane)
463 (display-parse-tree words syntax pane)
464 (display-parse-tree end syntax pane)))
465
466 (defgeneric display-parse-stack (symbol stack syntax pane))
467
468 (defmethod display-parse-stack (symbol stack (syntax html-syntax) pane)
469 (let ((next (parse-stack-next stack)))
470 (unless (null next)
471 (display-parse-stack (parse-stack-symbol next) next syntax pane))
472 (loop for parse-tree in (reverse (parse-stack-parse-trees stack))
473 do (display-parse-tree parse-tree syntax pane))))
474
475 (defun display-parse-state (state syntax pane)
476 (let ((top (parse-stack-top state)))
477 (if (not (null top))
478 (display-parse-stack (parse-stack-symbol top) top syntax pane)
479 (display-parse-tree (target-parse-tree state) syntax pane))))
480
481 (defmethod redisplay-pane-with-syntax ((pane climacs-pane) (syntax html-syntax) current-p)
482 (with-slots (top bot) pane
483 (setf *cursor-positions* (make-array (1+ (number-of-lines-in-region top bot)))
484 *current-line* 0
485 (aref *cursor-positions* 0) (stream-cursor-position pane))
486 (with-slots (lexer) syntax
487 (let ((average-token-size (max (float (/ (size (buffer pane)) (nb-lexemes lexer)))
488 1.0)))
489 ;; find the last token before bot
490 (let ((end-token-index (max (floor (/ (offset bot) average-token-size)) 1)))
491 ;; go back to a token before bot
492 (loop until (mark<= (end-offset (lexeme lexer (1- end-token-index))) bot)
493 do (decf end-token-index))
494 ;; go forward to the last token before bot
495 (loop until (or (= end-token-index (nb-lexemes lexer))
496 (mark> (start-offset (lexeme lexer end-token-index)) bot))
497 do (incf end-token-index))
498 (let ((start-token-index end-token-index))
499 ;; go back to the first token after top, or until the previous token
500 ;; contains a valid parser state
501 (loop until (or (mark<= (end-offset (lexeme lexer (1- start-token-index))) top)
502 (not (parse-state-empty-p
503 (slot-value (lexeme lexer (1- start-token-index)) 'state))))
504 do (decf start-token-index))
505 (let ((*white-space-start* (offset top)))
506 ;; display the parse tree if any
507 (unless (parse-state-empty-p (slot-value (lexeme lexer (1- start-token-index)) 'state))
508 (display-parse-state (slot-value (lexeme lexer (1- start-token-index)) 'state)
509 syntax
510 pane))
511 ;; display the lexemes
512 (with-drawing-options (pane :ink +red+)
513 (loop while (< start-token-index end-token-index)
514 do (let ((token (lexeme lexer start-token-index)))
515 (display-parse-tree token syntax pane))
516 (incf start-token-index))))))))
517 (let* ((cursor-line (number-of-lines-in-region top (point pane)))
518 (height (text-style-height (medium-text-style pane) pane))
519 (cursor-y (+ (* cursor-line (+ height (stream-vertical-spacing pane)))))
520 (cursor-column (column-number (point pane)))
521 (cursor-x (* cursor-column (text-style-width (medium-text-style pane) pane))))
522 (updating-output (pane :unique-id -1)
523 (draw-rectangle* pane
524 (1- cursor-x) (- cursor-y (* 0.2 height))
525 (+ cursor-x 2) (+ cursor-y (* 0.8 height))
526 :ink (if current-p +red+ +blue+))))))
527

  ViewVC Help
Powered by ViewVC 1.1.5