/[climacs]/climacs/html-syntax.lisp
ViewVC logotype

Contents of /climacs/html-syntax.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.11 - (show annotations)
Sun Mar 13 06:55:27 2005 UTC (9 years, 1 month ago) by rstrandh
Branch: MAIN
Changes since 1.10: +99 -78 lines
A step on the way to factoring out the incremental lexer.
1 ;;; -*- Mode: Lisp; Package: COMMON-LISP-USER -*-
2
3 ;;; (c) copyright 2005 by
4 ;;; Robert Strandh (strandh@labri.fr)
5
6 ;;; This library is free software; you can redistribute it and/or
7 ;;; modify it under the terms of the GNU Library General Public
8 ;;; License as published by the Free Software Foundation; either
9 ;;; version 2 of the License, or (at your option) any later version.
10 ;;;
11 ;;; This library is distributed in the hope that it will be useful,
12 ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 ;;; Library General Public License for more details.
15 ;;;
16 ;;; You should have received a copy of the GNU Library General Public
17 ;;; License along with this library; if not, write to the
18 ;;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 ;;; Boston, MA 02111-1307 USA.
20
21 ;;; Syntax for analysing HTML
22
23 (in-package :climacs-html-syntax)
24
25 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
26 ;;;
27 ;;; this should really go in syntax.lisp
28
29 (defclass parse-tree ()
30 ((start-mark :initarg :start-mark :reader start-mark)
31 (size :initarg :size)))
32
33 (defgeneric start-offset (parse-tree))
34
35 (defmethod start-offset ((tree parse-tree))
36 (offset (start-mark tree)))
37
38 (defgeneric end-offset (parse-tree))
39
40 (defmethod end-offset ((tree parse-tree))
41 (with-slots (start-mark size) tree
42 (+ (offset start-mark) size)))
43
44 (defclass lexer ()
45 ((buffer :initarg :buffer :reader buffer)))
46
47 (defgeneric nb-lexemes (lexer))
48 (defgeneric lexeme (lexer pos))
49
50 (defclass incremental-lexer (lexer)
51 ((lexemes :initform (make-instance 'standard-flexichain) :reader lexemes)))
52
53 (defmethod nb-lexemes ((lexer incremental-lexer))
54 (nb-elements (lexemes lexer)))
55
56 (defmethod lexeme ((lexer incremental-lexer) pos)
57 (element* (lexemes lexer) pos))
58
59 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
60 ;;;
61 ;;; grammar classes
62
63 (defclass html-sym (parse-tree)
64 ((badness :initform 0 :initarg :badness :reader badness)
65 (message :initform "" :initarg :message :reader message)))
66
67 (defmethod parse-tree-better ((t1 html-sym) (t2 html-sym))
68 (and (eq (class-of t1) (class-of t2))
69 (< (badness t1) (badness t2))))
70
71 (defclass html-nonterminal (html-sym) ())
72
73 (defclass words (html-nonterminal) ())
74
75 (defclass empty-words (words) ())
76
77 (defclass nonempty-words (words)
78 ((words :initarg :words)
79 (word :initarg :word)))
80
81 (defclass html-balanced (html-nonterminal)
82 ((start :initarg :start)
83 (end :initarg :end)))
84
85 (defclass html (html-balanced)
86 ((head :initarg :head)
87 (body :initarg :body)))
88
89 (defclass head (html-balanced)
90 ((title :initarg :title)))
91
92 (defclass html-words (html-balanced)
93 ((words :initarg :words)))
94
95 (defclass title (html-words) ())
96 (defclass body (html-words) ())
97 (defclass h1 (html-words) ())
98 (defclass h2 (html-words) ())
99 (defclass h3 (html-words) ())
100 (defclass a (html-words) ())
101 (defclass para (html-words) ())
102
103 (defclass html-token (html-sym)
104 ((ink) (face)))
105
106 (defclass html-tag (html-token) ())
107
108 (defclass <html> (html-tag) () (:default-initargs :size 6))
109 (defclass </html> (html-tag) ()(:default-initargs :size 7))
110 (defclass <head> (html-tag) () (:default-initargs :size 6))
111 (defclass </head> (html-tag) () (:default-initargs :size 7))
112 (defclass <title> (html-tag) () (:default-initargs :size 7))
113 (defclass </title> (html-tag) () (:default-initargs :size 8))
114 (defclass <body> (html-tag) () (:default-initargs :size 6))
115 (defclass </body> (html-tag) () (:default-initargs :size 7))
116 (defclass <h1> (html-tag) () (:default-initargs :size 4))
117 (defclass </h1> (html-tag) () (:default-initargs :size 5))
118 (defclass <h2> (html-tag) () (:default-initargs :size 4))
119 (defclass </h2> (html-tag) () (:default-initargs :size 5))
120 (defclass <h3> (html-tag) () (:default-initargs :size 4))
121 (defclass </h3> (html-tag) () (:default-initargs :size 5))
122 (defclass <p> (html-tag) () (:default-initargs :size 3))
123 (defclass </p> (html-tag) () (:default-initargs :size 4))
124 (defclass <ul> (html-tag) () (:default-initargs :size 4))
125 (defclass </ul> (html-tag) () (:default-initargs :size 5))
126 (defclass <li> (html-tag) () (:default-initargs :size 4))
127 (defclass </li> (html-tag) () (:default-initargs :size 5))
128 (defclass <a> (html-tag)
129 ((start :initarg :start)
130 (word :initarg :word)
131 (words :initarg :words)
132 (end :initarg :end)))
133 (defclass </a> (html-tag) () (:default-initargs :size 4))
134
135
136 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
137 ;;;
138 ;;; lexer
139
140 (defclass html-element (html-token)
141 ((state :initarg :state)))
142
143 (defclass start-element (html-element) ())
144 (defclass tag-start (html-element) ())
145 (defclass tag-end (html-element) ())
146 (defclass slash (html-element) ())
147 (defclass word (html-element) ())
148 (defclass delimiter (html-element) ())
149
150 (defun next-lexeme (scan)
151 (flet ((fo () (forward-object scan)))
152 (let ((object (object-after scan)))
153 (case object
154 (#\< (fo) (make-instance 'tag-start))
155 (#\> (fo) (make-instance 'tag-end))
156 (#\/ (fo) (make-instance 'slash))
157 (t (cond ((alphanumericp object)
158 (loop until (end-of-buffer-p scan)
159 while (alphanumericp (object-after scan))
160 do (fo))
161 (make-instance 'word))
162 (t
163 (fo) (make-instance 'delimiter))))))))
164
165 (define-syntax html-syntax ("HTML" (basic-syntax))
166 ((lexemes :initform (make-instance 'standard-flexichain))
167 (valid-parse :initform 1)
168 (parser)))
169
170 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
171 ;;;
172 ;;; parser
173
174 (defun word-is (word string)
175 (string-equal (coerce (region-to-sequence (start-mark word) (end-offset word)) 'string)
176 string))
177
178 (defparameter *html-grammar*
179 (grammar
180 (<html> -> (tag-start
181 (word (and (= (end-offset tag-start) (start-offset word))
182 (word-is word "html")))
183 (tag-end (= (end-offset word) (start-offset tag-end))))
184 :start-mark (start-mark tag-start))
185 (</html> -> (tag-start
186 (slash (= (end-offset tag-start) (start-offset slash)))
187 (word (and (= (end-offset slash) (start-offset word))
188 (word-is word "html")))
189 (tag-end (= (end-offset word) (start-offset tag-end))))
190 :start-mark (start-mark tag-start))
191 (<head> -> (tag-start
192 (word (and (= (end-offset tag-start) (start-offset word))
193 (word-is word "head")))
194 (tag-end (= (end-offset word) (start-offset tag-end))))
195 :start-mark (start-mark tag-start))
196 (</head> -> (tag-start
197 (slash (= (end-offset tag-start) (start-offset slash)))
198 (word (and (= (end-offset slash) (start-offset word))
199 (word-is word "head")))
200 (tag-end (= (end-offset word) (start-offset tag-end))))
201 :start-mark (start-mark tag-start))
202 (<title> -> (tag-start
203 (word (and (= (end-offset tag-start) (start-offset word))
204 (word-is word "title")))
205 (tag-end (= (end-offset word) (start-offset tag-end))))
206 :start-mark (start-mark tag-start))
207 (</title> -> (tag-start
208 (slash (= (end-offset tag-start) (start-offset slash)))
209 (word (and (= (end-offset slash) (start-offset word))
210 (word-is word "title")))
211 (tag-end (= (end-offset word) (start-offset tag-end))))
212 :start-mark (start-mark tag-start))
213 (<body> -> (tag-start
214 (word (and (= (end-offset tag-start) (start-offset word))
215 (word-is word "body")))
216 (tag-end (= (end-offset word) (start-offset tag-end))))
217 :start-mark (start-mark tag-start))
218 (</body> -> (tag-start
219 (slash (= (end-offset tag-start) (start-offset slash)))
220 (word (and (= (end-offset slash) (start-offset word))
221 (word-is word "body")))
222 (tag-end (= (end-offset word) (start-offset tag-end))))
223 :start-mark (start-mark tag-start))
224 (<a> -> (tag-start
225 (word (and (= (end-offset tag-start) (start-offset word))
226 (word-is word "a")))
227 words
228 tag-end)
229 :start-mark (start-mark tag-start)
230 :size (- (end-offset tag-end) (start-offset tag-start))
231 :start tag-start :word word :words words :end tag-end)
232 (</a> -> (tag-start
233 (slash (= (end-offset tag-start) (start-offset slash)))
234 (word (and (= (end-offset slash) (start-offset word))
235 (word-is word "a")))
236 (tag-end (= (end-offset word) (start-offset tag-end))))
237 :start-mark (start-mark tag-start))
238 (html -> (<html> head body </html>)
239 :start-mark (start-mark <html>)
240 :size (- (end-offset </html>) (start-offset <html>))
241 :start <html> :head head :body body :end </html>)
242 (head -> (<head> title </head>)
243 :start-mark (start-mark <head>)
244 :size (- (end-offset </head>) (start-offset <head>))
245 :start <head> :title title :end </head>)
246 (title -> (<title> words </title>)
247 :start-mark (start-mark <title>)
248 :size (- (end-offset </title>) (start-offset <title>))
249 :start <title> :words words :end </title>)
250 (body -> (<body> words </body>)
251 :start-mark (start-mark <body>)
252 :size (- (end-offset </body>) (start-offset <body>))
253 :start <body> :words words :end </body>)
254 (a -> (<a> words </a>)
255 :start-mark (start-mark <a>)
256 :size (- (end-offset </a>) (start-offset <a>))
257 :start <a> :words words :end </a>)
258 (words -> ()
259 (make-instance 'empty-words :start-mark nil))
260 (words -> (words word)
261 (make-instance 'nonempty-words
262 :start-mark (or (start-mark words) (start-mark word))
263 :size (- (end-offset word) (offset (or (start-mark words) (start-mark word))))
264 :words words :word word))
265 (word -> (a)
266 :start-mark (start-mark a)
267 :size (- (end-offset a) (start-offset a)))
268 (word -> (delimiter)
269 :start-mark (start-mark delimiter)
270 :size (- (end-offset delimiter) (start-offset delimiter)))))
271
272
273 (defmethod initialize-instance :after ((syntax html-syntax) &rest args)
274 (declare (ignore args))
275 (with-slots (parser lexemes buffer) syntax
276 (setf parser (make-instance 'parser
277 :grammar *html-grammar*
278 :target 'html))
279 (insert* lexemes 0 (make-instance 'start-element
280 :start-mark (make-instance 'standard-left-sticky-mark
281 :buffer buffer
282 :offset 0)
283 :size 0
284 :state (initial-state parser)))))
285
286 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
287 ;;;
288 ;;; update syntax
289
290 (defmethod update-syntax-for-display (buffer (syntax html-syntax) top bot)
291 (with-slots (parser lexemes valid-parse) syntax
292 (loop until (= valid-parse (nb-elements lexemes))
293 while (mark<= (end-offset (element* lexemes valid-parse)) bot)
294 do (let ((current-token (element* lexemes (1- valid-parse)))
295 (next-lexeme (element* lexemes valid-parse)))
296 (setf (slot-value next-lexeme 'state)
297 (advance-parse parser (list next-lexeme) (slot-value current-token 'state))))
298 (incf valid-parse))))
299
300 (defun delete-invalid-lexemes (lexemes from to)
301 "delete all lexemes between FROM and TO and return the first invalid
302 position in LEXEMES"
303 (let ((start 1)
304 (end (nb-elements lexemes)))
305 ;; use binary search to find the first lexeme to delete
306 (loop while (< start end)
307 do (let ((middle (floor (+ start end) 2)))
308 (if (mark< (end-offset (element* lexemes middle)) from)
309 (setf start (1+ middle))
310 (setf end middle))))
311 ;; delete lexemes
312 (loop until (or (= start (nb-elements lexemes))
313 (mark> (start-mark (element* lexemes start)) to))
314 do (delete* lexemes start))
315 start))
316
317
318 (defun inter-lexeme-object-p (lexemes object)
319 (declare (ignore lexemes))
320 (whitespacep object))
321
322 (defun skip-inter-lexeme-objects (lexemes scan)
323 (loop until (end-of-buffer-p scan)
324 while (inter-lexeme-object-p lexemes (object-after scan))
325 do (forward-object scan)))
326
327 (defun update-lex (lexemes start-pos end)
328 (let ((scan (make-instance 'standard-left-sticky-mark
329 :buffer (buffer end) ; FIXME, eventually use the buffer of the lexer
330 :offset (end-offset (element* lexemes (1- start-pos))))))
331 (loop do (skip-inter-lexeme-objects lexemes scan)
332 until (if (end-of-buffer-p end)
333 (end-of-buffer-p scan)
334 (mark> scan end))
335 do (let* ((start-mark (clone-mark scan))
336 (lexeme (next-lexeme scan))
337 (size (- (offset scan) (offset start-mark))))
338 (setf (slot-value lexeme 'start-mark) start-mark
339 (slot-value lexeme 'size) size)
340 (insert* lexemes start-pos lexeme))
341 (incf start-pos))))
342
343 (defmethod update-syntax (buffer (syntax html-syntax))
344 (with-slots (lexemes valid-parse) syntax
345 (let* ((low-mark (low-mark buffer))
346 (high-mark (high-mark buffer))
347 (first-invalid-position (delete-invalid-lexemes lexemes low-mark high-mark)))
348 (setf valid-parse first-invalid-position)
349 (update-lex lexemes first-invalid-position high-mark))))
350
351 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
352 ;;;
353 ;;; display
354
355 (defvar *white-space-start* nil)
356
357 (defvar *cursor-positions* nil)
358 (defvar *current-line* 0)
359
360 (defun handle-whitespace (pane buffer start end)
361 (let ((space-width (space-width pane))
362 (tab-width (tab-width pane)))
363 (loop while (< start end)
364 do (ecase (buffer-object buffer start)
365 (#\Newline (terpri pane)
366 (setf (aref *cursor-positions* (incf *current-line*))
367 (multiple-value-bind (x y) (stream-cursor-position pane)
368 (declare (ignore x))
369 y)))
370 (#\Space (stream-increment-cursor-position
371 pane space-width 0))
372 (#\Tab (let ((x (stream-cursor-position pane)))
373 (stream-increment-cursor-position
374 pane (- tab-width (mod x tab-width)) 0))))
375 (incf start))))
376
377 (defmethod display-parse-tree :around ((entity html-sym) syntax pane)
378 (with-slots (top bot) pane
379 (when (mark> (end-offset entity) top)
380 (call-next-method))))
381
382 (defmethod display-parse-tree :around ((entity empty-words) syntax pane)
383 (declare (ignore syntax pane))
384 nil)
385
386 (defmethod display-parse-tree ((entity html-token) (syntax html-syntax) pane)
387 (flet ((cache-test (t1 t2)
388 (and (eq t1 t2)
389 (eq (slot-value t1 'ink)
390 (medium-ink (sheet-medium pane)))
391 (eq (slot-value t1 'face)
392 (text-style-face (medium-text-style (sheet-medium pane)))))))
393 (updating-output (pane :unique-id entity
394 :id-test #'eq
395 :cache-value entity
396 :cache-test #'cache-test)
397 (with-slots (ink face) entity
398 (setf ink (medium-ink (sheet-medium pane))
399 face (text-style-face (medium-text-style (sheet-medium pane))))
400 (present (coerce (region-to-sequence (start-mark entity)
401 (end-offset entity))
402 'string)
403 'string
404 :stream pane)))))
405
406 (defmethod display-parse-tree :around ((entity html-tag) (syntax html-syntax) pane)
407 (with-drawing-options (pane :ink +green+)
408 (call-next-method)))
409
410 (defmethod display-parse-tree :before ((entity html-token) (syntax html-syntax) pane)
411 (handle-whitespace pane (buffer pane) *white-space-start* (start-offset entity))
412 (setf *white-space-start* (end-offset entity)))
413
414 (defmethod display-parse-tree :before ((entity html-balanced) (syntax html-syntax) pane)
415 (with-slots (start) entity
416 (display-parse-tree start syntax pane)))
417
418 (defmethod display-parse-tree :after ((entity html-balanced) (syntax html-syntax) pane)
419 (with-slots (end) entity
420 (display-parse-tree end syntax pane)))
421
422 (defmethod display-parse-tree :around ((entity title) (syntax html-syntax) pane)
423 (with-text-face (pane :bold)
424 (call-next-method)))
425
426 (defmethod display-parse-tree ((entity html-words) (syntax html-syntax) pane)
427 (with-slots (words) entity
428 (display-parse-tree words syntax pane)))
429
430 (defmethod display-parse-tree ((entity empty-words) (syntax html-syntax) pane)
431 (declare (ignore pane))
432 nil)
433
434 (defmethod display-parse-tree ((entity nonempty-words) (syntax html-syntax) pane)
435 (with-slots (words word) entity
436 (display-parse-tree words syntax pane)
437 (display-parse-tree word syntax pane)))
438
439 (defmethod display-parse-tree ((entity html) (syntax html-syntax) pane)
440 (with-slots (head body) entity
441 (display-parse-tree head syntax pane)
442 (display-parse-tree body syntax pane)))
443
444 (defmethod display-parse-tree ((entity head) (syntax html-syntax) pane)
445 (with-slots (title) entity
446 (display-parse-tree title syntax pane)))
447
448 (defmethod display-parse-tree ((entity <a>) (syntax html-syntax) pane)
449 (with-slots (start word words end) entity
450 (display-parse-tree start syntax pane)
451 (display-parse-tree word syntax pane)
452 (display-parse-tree words syntax pane)
453 (display-parse-tree end syntax pane)))
454
455 (defgeneric display-parse-stack (symbol stack syntax pane))
456
457 (defmethod display-parse-stack (symbol stack (syntax html-syntax) pane)
458 (let ((next (parse-stack-next stack)))
459 (unless (null next)
460 (display-parse-stack (parse-stack-symbol next) next syntax pane))
461 (loop for parse-tree in (reverse (parse-stack-parse-trees stack))
462 do (display-parse-tree parse-tree syntax pane))))
463
464 (defun display-parse-state (state syntax pane)
465 (let ((top (parse-stack-top state)))
466 (if (not (null top))
467 (display-parse-stack (parse-stack-symbol top) top syntax pane)
468 (display-parse-tree (target-parse-tree state) syntax pane))))
469
470 (defmethod redisplay-pane-with-syntax ((pane climacs-pane) (syntax html-syntax) current-p)
471 (with-slots (top bot) pane
472 (setf *cursor-positions* (make-array (1+ (number-of-lines-in-region top bot)))
473 *current-line* 0
474 (aref *cursor-positions* 0) (stream-cursor-position pane))
475 (with-slots (lexemes) syntax
476 (let ((average-token-size (max (float (/ (size (buffer pane)) (nb-elements lexemes)))
477 1.0)))
478 ;; find the last token before bot
479 (let ((end-token-index (max (floor (/ (offset bot) average-token-size)) 1)))
480 ;; go back to a token before bot
481 (loop until (mark<= (end-offset (element* lexemes (1- end-token-index))) bot)
482 do (decf end-token-index))
483 ;; go forward to the last token before bot
484 (loop until (or (= end-token-index (nb-elements lexemes))
485 (mark> (start-offset (element* lexemes end-token-index)) bot))
486 do (incf end-token-index))
487 (let ((start-token-index end-token-index))
488 ;; go back to the first token after top, or until the previous token
489 ;; contains a valid parser state
490 (loop until (or (mark<= (end-offset (element* lexemes (1- start-token-index))) top)
491 (not (parse-state-empty-p
492 (slot-value (element* lexemes (1- start-token-index)) 'state))))
493 do (decf start-token-index))
494 (let ((*white-space-start* (offset top)))
495 ;; display the parse tree if any
496 (unless (parse-state-empty-p (slot-value (element* lexemes (1- start-token-index)) 'state))
497 (display-parse-state (slot-value (element* lexemes (1- start-token-index)) 'state)
498 syntax
499 pane))
500 ;; display the lexemes
501 (with-drawing-options (pane :ink +red+)
502 (loop while (< start-token-index end-token-index)
503 do (let ((token (element* lexemes start-token-index)))
504 (display-parse-tree token syntax pane))
505 (incf start-token-index))))))))
506 (let* ((cursor-line (number-of-lines-in-region top (point pane)))
507 (height (text-style-height (medium-text-style pane) pane))
508 (cursor-y (+ (* cursor-line (+ height (stream-vertical-spacing pane)))))
509 (cursor-column (column-number (point pane)))
510 (cursor-x (* cursor-column (text-style-width (medium-text-style pane) pane))))
511 (updating-output (pane :unique-id -1)
512 (draw-rectangle* pane
513 (1- cursor-x) (- cursor-y (* 0.2 height))
514 (+ cursor-x 2) (+ cursor-y (* 0.8 height))
515 :ink (if current-p +red+ +blue+))))))
516

  ViewVC Help
Powered by ViewVC 1.1.5