/*
 * Patch for lib/tokenizer.c, from the chasen-2.2.8 tree to chasen-2.2.8-nmz.
 *
 * What the hunks below do:
 *   - add two enum constants, HALF_DIGIT and HALF_PUNCT, next to HALF_LATIN;
 *   - in the state-handling chain, map HALF_DIGIT and HALF_PUNCT onto
 *     HALF_LATIN, and stop matching FULL_LATIN in the do-nothing branch
 *     (FULL_LATIN now falls through to the later conditions);
 *   - in two classification chains, return HALF_DIGIT when isdigit(str[0])
 *     is true and HALF_PUNCT when ispunct(str[0]) is true, tested after the
 *     branches that return HALF_LATIN and JA_SPACE.
 *
 * NOTE(review): the two classification hunks sit just before the
 * `mblen == 2` and `mblen == 3` cases; presumably they are the single-byte
 * cases of two per-encoding routines -- confirm against the full file.
 * NOTE(review): isdigit()/ispunct() need an argument representable as
 * unsigned char (or EOF); confirm that str is an unsigned char pointer.
 * NOTE(review): the line breaks and leading whitespace of this patch were
 * reconstructed; apply with whitespace-insensitive matching (patch -l) or
 * re-indent against the real tokenizer.c.
 */
--- chasen-2.2.8/lib/tokenizer.c Thu Jun 21 14:37:36 2001
+++ chasen-2.2.8-nmz/lib/tokenizer.c Tue Jul 24 20:22:53 2001
@@ -53,7 +53,9 @@
     KATAKANA, /* KATAKANA LETTER (SMALL) [A-KE] */
     SMALL_KATAKANA, /* KATAKANA LETTER SMALL AIUEO, TU, YAYUYO, WA */
     FULL_LATIN, /* FULLWIDTH LATIN (CAPITAL|SMALL) LETTER [A-Z] */
-    HALF_LATIN, /* LATIN (CAPITAL|SMALL) LETTER [A-Z] */
+    HALF_LATIN, /* HALFWIDTH LATIN (CAPITAL|SMALL) LETTER [A-Z] */
+    HALF_DIGIT, /* HALFWIDTH DIGIT [0-9] */
+    HALF_PUNCT, /* HALFWIDTH PUNCTUATION */
     JA_OTHER,
 };
 
@@ -318,9 +320,12 @@
 {
     if (state == JA_SPACE) {
         tok->_anno_type[cursor] = -1;
-    } else if ((state == HALF_LATIN) ||
-               (state == FULL_LATIN)) {
-        ; /* do nothing */
+    } else if (state == HALF_LATIN) {
+        ; /* do nothing */
+    } else if (state == HALF_DIGIT) {
+        state = HALF_LATIN; /* reclassify a halfwidth digit as HALF_LATIN */
+    } else if (state == HALF_PUNCT) {
+        state = HALF_LATIN; /* reclassify halfwidth punctuation as HALF_LATIN */
     } else if (((*state0 == KATAKANA) &&
                 ((state == PROLONGED) ||
                  (state == SMALL_KATAKANA))) ||
@@ -356,7 +361,11 @@
             return HALF_LATIN;
         } else if (is_space(str[0])) {
             return JA_SPACE;
-        }
+        } else if (isdigit(str[0])) { /* neither of the cases above: digit? */
+            return HALF_DIGIT;
+        } else if (ispunct(str[0])) { /* otherwise: punctuation per ispunct()? */
+            return HALF_PUNCT;
+        }
     } else if (mblen == 2) {
         if ((str[0] == 0xa1) && (str[1] == 0xbc)) {
             return PROLONGED;
@@ -388,7 +397,11 @@
             return HALF_LATIN;
         } else if (is_space(str[0])) {
             return JA_SPACE;
-        }
+        } else if (isdigit(str[0])) { /* neither of the cases above: digit? */
+            return HALF_DIGIT;
+        } else if (ispunct(str[0])) { /* otherwise: punctuation per ispunct()? */
+            return HALF_PUNCT;
+        }
     } else if (mblen == 3) {
         if ((str[0] == 0xe3) && (str[1] == 0x83) && (str[2] == 0xbc)) {
             return PROLONGED;