;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: REGEX; Base: 10 -*- (in-package "REGEX") ; ; Rewrite to use parse-tree functions. ; Separate optimization from parsing. ; Add separate canonicalization and rewrite passes. ; ; Partly because I'm about to allow the caller to manipulate ; the parse trees himself, so I can't depend on the parse ; tree being in any particular format. Partly because the ; lexer needs some beefier optimizations that I can easily ; provide in the current ad-hoc scheme. And partly because ; compile-greedy-star/compile-greedy-plus are a mess and ; about to get messier. ; ; Planned organization: ; pass 1. Lexical analysis ; pass 2. Parsing ; pass 3. Canonicalize parse tree (needed because we allow ; program interaction at the parse tree level). Also ; convert (reg n ) to (seq (rstart n) (rend n)). ; Either are valid parse-tree inputs, but the (reg ...) ; syntax is nearly always more convenient, but obscures ; the cps form, complicating some optimizations. ; pass 4. Instruction selection (fast-xxx ops) ; pass 5. CPS conversion to tuple list. ; For incremental use: ; pass 6. Code (closure) generation. ; pass 7. Linking (resolve target labels to functions). ; ; For batch use (i.e. deflexer, defregex et al) ; pass 6: Sexpr generation ; ; Passes 1&2 are coroutines. ; Pass 3 should be iterated until it reaches a fixedpoint. ; Passes 4-7 are currently all rolled into one pass, which is ; a PITA to maintain, and doesn't facilitate the generation of both ; closures and code. ; ; ; Planned optimizations: ; ; Merge ALT heads and tails. ; (alt (seq a A) (seq a B) (seq b A) (seq b B)) ; --> (alt (seq a (alt A B)) (seq b (alt A B)) ; --> (seq (alt (seq a) (seq b)) (alt (seq A) (seq B))) ; DONE. ; ; For n-way ALTs, support merging subsets of the children, and n-way ; branches on leading char. This should be a big win for the lexer. ; (alt (seq a) (seq a c) (seq c)) -> (alt (seq a (alt nil (seq c))) (seq c)) ; --> (guarded-alt (a (alt nil (seq c))) ; (c )) ; Since this involves pushing down alts, it conflicts with the ; alt-merging logic, so we need a way to turn off the merging logic ; temporarily (we need to group char type branches into a sub-alt, ; canonicalize them, then re-run the alt-merge at the higher level ; to re-integrate anything that didn't migrate out. Then at simplify ; time, we look for runs of alt branches that start with char-type ; nodes, and group these into guarded-alts. ; ; Merge single-char/char-class alt clauses into char-class. ; (seq (alt (seq a) (seq b)) (alt (seq A) (seq B))) ; --> (seq (charclass "ab") (charclass "AB")) ; DONE. ; ; Merge lists of chars and strings into strings. ; DONE. ; ; Support lists of 2-char classes (common for case-insensitive matches) ; DONE ; ; Eliminate null states in sequences (caused by fully merging ; alt clause heads, and possibly others). ; DONE. ; ; Support fast alt of (|node), (|node), ; (|node), (|node) ; ; Support merging sequences of 1-valued nodes into a specialized matcher ; that takes an array of "match" functions that return either nil or the ; new pos. If any of them fail, the whole sequence fails. ; ; If the child of + is something trivial like char, seq of char, ; any, charclass, specclass, or seq of charclass, leave it as a ; + node and specialize it during the instruction selection pass. ; ; Additional features: ; Cut ; Forward/backward context matches ; Match hooks ; Acceptance functions ; ;;; ;;; Code Generator ;;; (defun compile-expr-to-matcher (parse-tree &optional str) (multiple-value-bind (matchfn numregs simplified-tree) (compile-expr-to-matchfn parse-tree) (cond (*match-simple-strings-only* (make-matcher :simple-string-matchfn matchfn :string-matchfn nil :numregs numregs :matchstr str :matchexpr simplified-tree)) (t (make-matcher :simple-string-matchfn nil :string-matchfn matchfn :numregs numregs :matchstr str :matchexpr simplified-tree))))) (defun compile-expr-to-matchfn (parse-tree &key (simplifyp t)) (let* ((numregs (1+ (compute-max-regnum parse-tree))) (simple-tree (if simplifyp (optimize-regex-tree parse-tree) parse-tree)) (instr-tree (select-instructions simple-tree))) (multiple-value-bind (start-instr cps-instrs) (gen-instr-list instr-tree) (when *regex-compile-verbose* (format t "~&~%Numregs: ~D" numregs) (format t "~&~%Simplified tree:") (pprint simple-tree) (format t "~&~%Instruction tree:") (pprint instr-tree) (format t "~&~%CPS instruction list (start = ~D):" start-instr) (pprint cps-instrs)) (let ((closure-info (gen-closures cps-instrs))) (link-closures closure-info) (let ((matchfn (make-init-closure (remove-if #'null (map 'list #'closure-info-initfn closure-info)) (resolve-instr closure-info start-instr)))) (values (make-anchored-matcher matchfn) numregs simple-tree)))))) (defun make-anchored-matcher (matchfn) #'(lambda (*str* *regs* *start* *end* *start-is-anchor* *end-is-anchor* *acceptfn* *hooks*) #-:debug-regex (declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (special *str* *regs* *start* *end* *start-is-anchor* *end-is-anchor* *acceptfn* *hooks*)) (declare (ftype (function (fixnum) t) matchfn)) (catch 'cease-matching (funcall matchfn *start*)))) ; search the parse tree, looking for the highest register in use. (defun compute-max-regnum (node) (cond ((seq-node-p node) (reduce #'max (mapcar #'compute-max-regnum (seq-node-children node)) :initial-value -1)) ((alt-node-p node) (reduce #'max (mapcar #'compute-max-regnum (alt-node-children node)) :initial-value -1)) ((kleene-node-p node) (compute-max-regnum (kleene-node-child node))) ((pkleene-node-p node) (compute-max-regnum (pkleene-node-child node))) ((optional-node-p node) (compute-max-regnum (optional-node-child node))) ((range-node-p node) (compute-max-regnum (range-node-child node))) ((backmatch-node-p node) (backmatch-node-regnum node)) ((register-node-p node) (max (register-node-regnum node) (compute-max-regnum (register-node-child node)))) ((regstart-node-p node) (regstart-node-regnum node)) ((regend-node-p node) (regend-node-regnum node)) (t 0))) ;;; ;;; Pass 7 - Linking ;;; ; resolve the target labels with the actual target closures (defun link-closures (link-info) (loop for info across link-info for linkfn = (closure-info-linkfn info) when (functionp linkfn) do (funcall linkfn link-info))) ;;;; ;;;; Compiled Matcher structure, and high-level functions ;;;; (defun compile-str (patstr) "Parse a string regex expression, and compile it into matcher object. Uses the pattern cache." ; (format t "~A entries in cache" (hash-table-count *pattern-cache*)) (let ((cached-machine (gethash patstr *pattern-cache*))) (or cached-machine (newly-compiled-str-matcher patstr)))) (defun compile-expr (regexpr) "Parse a string regex expression, and compile it into matcher object. Uses the pattern cache." ; (format t "~A entries in cache" (hash-table-count *pattern-cache*)) (let ((cached-machine (gethash regexpr *pattern-cache*))) (or cached-machine (newly-compiled-expr-matcher regexpr)))) ; Try to use the quickest matcher for the input string. If the ; candidate string isn't a simple string, then match with the slower ; string-matcher. Since this isn't compiled by default, it may need ; to be compiled from the saved expr. (defun match-str-all-parms (matcher candstr regs start length start-is-anchor end-is-anchor acceptfn hooks) (dotimes (i (length regs)) (let ((reg (aref regs i))) (setf (car reg) nil) (setf (cdr reg) nil))) (cond ((simple-string-p candstr) (cond ((functionp (matcher-simple-string-matchfn matcher)) (funcall (matcher-simple-string-matchfn matcher) candstr regs start (+ start length) start-is-anchor end-is-anchor acceptfn hooks)) ((functionp (matcher-string-matchfn matcher)) (funcall (matcher-string-matchfn matcher) candstr regs start (+ start length) start-is-anchor end-is-anchor acceptfn hooks)) (t (error "REGEX Error: ~S is not a valid regex matcher" matcher)))) ((stringp candstr) (cond ((functionp (matcher-string-matchfn matcher)) (funcall (matcher-string-matchfn matcher) candstr regs start (+ start length) start-is-anchor end-is-anchor acceptfn hooks)) ((functionp (matcher-matchexpr matcher)) (let ((*match-simple-strings-only* nil)) (setf (matcher-string-matchfn matcher) (compile-expr-to-matcher (matcher-matchexpr matcher)))) (unless (matcher-string-matchfn matcher) (error "REGEX Error: ~S does not have a valid match function for class STRING" matcher)) (funcall (matcher-string-matchfn matcher) candstr regs start (+ start length) start-is-anchor end-is-anchor acceptfn hooks)))) (t (error "REGEX Error: ~S is not a string" candstr)))) (defun match-str (matcher candstr &key (regs (make-regs (matcher-numregs matcher))) (start 0) (length (- (length candstr) start)) (start-is-anchor (= start 0)) (end-is-anchor (= length (length candstr))) acceptfn hooks) "Run a matcher against a candidate string, without scanning \(so it is implicitly anchored\). Returns \(values t start end regs\) on success, nil on failure." (match-str-all-parms matcher candstr regs start length start-is-anchor end-is-anchor acceptfn hooks)) (define-compiler-macro match-str (matcher candstr &key (regs `(make-regs (matcher-numregs ,matcher))) (start 0) (length `(- (length ,candstr) ,start)) (start-is-anchor `(= ,start 0)) (end-is-anchor `(= ,length (length ,candstr))) acceptfn hooks) `(match-str-all-parms ,matcher ,candstr ,regs ,start ,length ,start-is-anchor ,end-is-anchor ,acceptfn ,hooks)) ;;; This really needs a prefix-map array in the matcher structure so we can ;;; quickly find potential beginnings to the string (defun scan-str-all-parms (matcher str regs start length start-is-anchor end-is-anchor acceptfn hooks) (declare (type matcher matcher) (string str) (fixnum start length)) (let ((matchedp t) match-start (match-start-pos start) (len-remaining length) (match-len length) (match-regs nil)) (loop (multiple-value-setq (matchedp match-start match-len match-regs) (match-str-all-parms matcher str regs match-start-pos len-remaining (and start-is-anchor (= match-start-pos start)) end-is-anchor acceptfn hooks)) (cond (matchedp (return-from scan-str-all-parms (values matchedp match-start match-len match-regs))) ((>= match-start-pos (+ start length)) (return-from scan-str-all-parms nil)) (t (incf match-start-pos) (decf len-remaining)))))) (defun scan-str (matcher candstr &key (regs (make-regs (matcher-numregs matcher))) (start 0) (length (length candstr)) (start-is-anchor (= start 0)) (end-is-anchor (= length (length candstr))) acceptfn hooks) "Run a matcher against a candidate string, scanning forward if necessary. Returns \(values t start end regs\) on success, nil on failure." (scan-str-all-parms matcher candstr regs start length start-is-anchor end-is-anchor acceptfn hooks)) (define-compiler-macro scan-str (matcher candstr &key (regs `(make-regs (matcher-numregs ,matcher))) (start 0) (length `(length ,candstr)) (start-is-anchor `(= ,start 0)) (end-is-anchor `(= ,length (length ,candstr))) acceptfn hooks) `(scan-str-all-parms ,matcher ,candstr ,regs ,start ,length ,start-is-anchor ,end-is-anchor ,acceptfn ,hooks)) (defun uncached-compile-str (patstr) (let ((result (catch 'regex-parse-error (compile-expr-to-matcher (parse-str patstr) patstr)))) (cond ((matcher-p result) result) (t (apply #'format (cons t result)) nil)))) (defun uncached-compile-expr (regexpr &optional str) (let ((result (catch 'regex-parse-error (compile-expr-to-matcher regexpr str)))) (cond ((matcher-p result) result) (t (apply #'format (cons t result)) nil)))) (defun newly-compiled-str-matcher (patstr) (when (>= (hash-table-count *pattern-cache*) +max-regex-str-cache+) (clrhash *pattern-cache*)) (setf (gethash patstr *pattern-cache*) (uncached-compile-str patstr))) (defun newly-compiled-expr-matcher (regexpr) (when (>= (hash-table-count *pattern-cache*) +max-regex-str-cache+) (clrhash *pattern-cache*)) (setf (gethash regexpr *pattern-cache*) (uncached-compile-expr regexpr))) ;;; ;;; Testing ;;; (defun testcomp (str) (compile-expr-to-matcher (parse-str str) str)) (defun testmatch (str pat &key hooks) (clear-pattern-cache) (match-str (compile-str pat) str :hooks hooks))