[search] Fixed letter-or-building-part handling.

This commit is contained in:
Yuri Gorshenin 2016-06-07 18:02:17 +03:00
parent eeb41dadb8
commit 47ee541872

View file

@ -105,7 +105,7 @@ exec /usr/bin/env sbcl --noinform --quit --eval "(defparameter *script-name* \"$
'("building" "bldg" "bld" "bl" "unit" "block" "blk"
"корпус" "корп" "литер" "лит" "строение" "блок" "бл"))
(defparameter *house-number-seps* '(#\Space #\. #\( #\) #\# #\~))
(defparameter *house-number-seps* '(#\Space #\Tab #\" #\\ #\( #\) #\. #\# #\~))
(defparameter *house-number-groups-seps* '(#\, #\| #\; #\+))
(defun building-synonym-p (s)
@ -124,21 +124,57 @@ exec /usr/bin/env sbcl --noinform --quit --eval "(defparameter *script-name* \"$
((char= c #\/) :slash)
(T :string)))
(defun transform-string-token (fn value)
"Transforms building token value into one or more tokens in
accordance to it's value. For example, 'литA' is transformed to
tokens 'лит' (building part) and 'А' (letter).
"
(macrolet ((emit (value type) `(funcall fn ,value ,type)))
(cond ((building-synonym-p value)
(emit value :building-part))
((and (= 4 (length value))
(starts-with value "лит"))
(emit (subseq value 0 3) :building-part)
(emit (subseq value 3) :letter))
((and (= 2 (length value))
(short-building-synonym-p (subseq value 0 1)))
(emit (subseq value 0 1) :building-part)
(emit (subseq value 1) :letter))
((= 1 (length value))
(emit value (if (short-building-synonym-p value)
:letter-or-building-part
:letter)))
(T (emit value :string)))))
(defun tokenize-house-number (house-number)
"house-number => [token]"
(let ((parts (group-by #'(lambda (lhs rhs)
(eq (get-char-type lhs) (get-char-type rhs)))
(string-downcase house-number))))
(remove-if #'(lambda (token) (eq :separator (token-type token)))
(mapcar #'(lambda (part) (make-token :value (concatenate 'string part)
:type (get-char-type (car part))))
parts))))
(string-downcase house-number)))
(tokens nil))
(flet ((add-token (value type) (push (make-token :value value :type type) tokens)))
(dolist (part parts)
(let ((value (concatenate 'string part))
(type (get-char-type (car part))))
(case type
(:string (transform-string-token #'add-token value))
(:separator T)
(otherwise (add-token value type)))))
(loop for prev = nil then curr
for curr in tokens
do (when (eq :letter-or-building-part (token-type curr))
(cond ((null prev)
(setf (token-type curr) :letter))
((eq :number (token-type prev))
(setf (token-type curr) :building-part)))))
(reverse tokens))))
(defun house-number-with-optional-suffix-p (tokens)
(case (length tokens)
(1 (eq (token-type (first tokens)) :number))
(2 (and (eq (token-type (first tokens)) :number)
(eq (token-type (second tokens)) :string)))
(or (eq (token-type (second tokens)) :string)
(eq (token-type (second tokens)) :letter))))
(otherwise nil)))
(defun get-house-number-sub-numbers (house-number)
@ -157,48 +193,6 @@ exec /usr/bin/env sbcl --noinform --quit --eval "(defparameter *script-name* \"$
groups
(list tokens))))
(defun parse-house-number (tokens)
"[token] => [token]
Parses house number, but as the grammar is undefined and ambiguous,
the parsing is just a split of some tokens, i.e. 'литА' will be split
to building synonym (литер) and to letter (A).
"
(loop with result = nil
for token in tokens
for token-value = (token-value token)
for token-type = (token-type token)
do (case token-type
(:string (cond ((building-synonym-p token-value)
(push (make-token :value token-value
:type :building-part)
result))
((and (= 4 (length token-value))
(starts-with token-value "лит"))
(push (make-token :value (subseq token-value 0 3)
:type :building-part)
result)
(push (make-token :value (subseq token-value 3)
:type :letter)
result))
((and (= 2 (length token-value))
(short-building-synonym-p (subseq token-value 0 1)))
(push (make-token :value (subseq token-value 0 1)
:type :building-part)
result)
(push (make-token :value (subseq token-value 1)
:type :letter)
result))
((= 1 (length token-value))
(push (make-token :value token-value
:type (if (short-building-synonym-p token-value)
:letter-or-building-part
:letter))
result))
(T (push token result))))
(otherwise (push token result)))
finally (return (reverse result))))
(defun join-house-number-tokens (tokens)
"Joins token values with spaces."
(format nil "~{~a~^ ~}" (mapcar #'token-value tokens)))
@ -228,21 +222,20 @@ exec /usr/bin/env sbcl --noinform --quit --eval "(defparameter *script-name* \"$
(defun get-house-number-pattern (house-number fn)
(dolist (number (get-house-number-sub-numbers house-number))
(let ((house-number (join-house-number-tokens number))
(pattern (join-house-number-parse (parse-house-number number))))
(pattern (join-house-number-parse number)))
(funcall fn house-number pattern))))
(defun get-house-number-prefix-strings (house-number fn)
(dolist (number (get-house-number-sub-numbers house-number))
(dolist (string (mapcar #'token-value
(take-while #'character-token-p (parse-house-number number))))
(take-while #'character-token-p number)))
(funcall fn string string))))
(defun get-house-number-inner-strings (house-number fn)
(dolist (number (get-house-number-sub-numbers house-number))
(dolist (string (mapcar #'token-value
(remove-if-not #'character-token-p
(drop-while #'character-token-p
(parse-house-number number)))))
(drop-while #'character-token-p number))))
(funcall fn string string))))
(defstruct type-settings