|
| 1 | +import re |
| 2 | +from chinese_num_conf import UNIT_CN2AN |
# Chinese digit characters that may appear inside a numeral.
all_num = "零一二三四五六七八九"
# Every key of UNIT_CN2AN concatenated; interpolated into regex character
# classes below, so the keys are presumably single characters (TODO confirm).
all_unit = "".join(UNIT_CN2AN.keys())
# A numeral written in Chinese characters: optional "负" sign, optional
# "<integer part>点" prefix, then one or more digit/unit characters.
cn_pattern = f"负?([{all_num}{all_unit}]+点)?[{all_num}{all_unit}]+"
# A numeral written with ASCII digits followed by Chinese unit characters,
# e.g. "-1.5" + units.  The decimal point is escaped so that it only matches
# a literal "." instead of any character.
smart_cn_pattern = fr"-?([0-9]+\.)?[0-9]+[{all_unit}]+"
# Value of every digit character the converter understands.
_CN_DIGIT_VALUES = {
    '〇': 0, '零': 0, '一': 1, '二': 2, '三': 3, '四': 4,
    '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
    '壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5, '陆': 6,
    '柒': 7, '捌': 8, '玖': 9, '貮': 2, '兩': 2, '两': 2,
}
# Multiplier of every unit character the converter understands.
_CN_UNIT_VALUES = {
    '十': 10,
    '拾': 10,
    '百': 100,
    '佰': 100,
    '千': 1000,
    '仟': 1000,
    '万': 10000,
    '萬': 10000,
    '亿': 100000000,
    '億': 100000000,
    '兆': 1000000000000,
}
# Units that multiply the whole group written in front of them rather than
# a single digit.
_CN_SECTION_UNITS = frozenset((10000, 100000000, 1000000000000))


def _cn_integer_to_int(text):
    """Convert an unsigned Chinese integer numeral (digits and units) to int."""
    for char in text:
        if char not in _CN_DIGIT_VALUES and char not in _CN_UNIT_VALUES:
            raise ValueError(
                f"unsupported character {char!r} in Chinese numeral {text!r}")

    # A run made only of digits carries no units, so it is positional
    # notation ("二零二零" is 2020), not something to be summed up.
    if text and all(char in _CN_DIGIT_VALUES for char in text):
        return int("".join(str(_CN_DIGIT_VALUES[char]) for char in text))

    pending_unit = 0
    parts = []
    # Walk right to left so each unit is seen before the digit it scales.
    for char in reversed(text):
        if char in _CN_UNIT_VALUES:
            pending_unit = _CN_UNIT_VALUES[char]
            if pending_unit in _CN_SECTION_UNITS:
                # Section units are kept as markers; the digit to their left
                # is then taken at face value.
                parts.append(pending_unit)
                pending_unit = 1
        else:
            value = _CN_DIGIT_VALUES[char]
            if pending_unit:
                value *= pending_unit
                pending_unit = 0
            parts.append(value)
    # A leading "十" has no digit in front of it ("十五"): it stands for 10.
    if pending_unit == 10:
        parts.append(10)

    total, section = 0, 0
    for part in reversed(parts):
        if part in _CN_SECTION_UNITS:
            total += section * part
            section = 0
        else:
            section += part
    return total + section


def chinese_to_arabic(chinese_num):
    """Convert a numeral written in Chinese characters to a number.

    Supported input: the digit and unit characters listed in the tables
    above, an optional leading "负" (negative sign) and an optional "点"
    followed by decimal digits.  Strings consisting only of digits are read
    positionally, e.g. "二零二零" gives 2020.

    Known limitation: stacked section units are not combined, so "一万亿"
    is not read as 10**12.

    Args:
        chinese_num: the numeral as a string; an empty string gives 0.

    Returns:
        An int, or a float when the numeral contains a "点" decimal part.

    Raises:
        ValueError: if the string contains a character that is neither a
            known digit nor a known unit, if nothing follows the sign, or
            if the decimal part is empty or contains non-digit characters.
    """
    text = chinese_num
    negative = text[:1] == "负"
    if negative:
        text = text[1:]
        if not text:
            raise ValueError(f"no digits after sign in {chinese_num!r}")

    integer_text, point, fraction_text = text.partition("点")
    value = _cn_integer_to_int(integer_text)
    if point:
        if not fraction_text or any(
                char not in _CN_DIGIT_VALUES for char in fraction_text):
            raise ValueError(f"invalid decimal part in {chinese_num!r}")
        digits = "".join(str(_CN_DIGIT_VALUES[char]) for char in fraction_text)
        # Going through the decimal literal keeps str(result) equal to the
        # digits that were written.
        value = float(f"{value}.{digits}")
    return -value if negative else value
def __sub_util(inputs, sub_mode: str = "number") -> str:
    """Convert the Chinese numerals inside one matched fragment.

    Args:
        inputs: the matched text fragment.
        sub_mode: how to treat the fragment:
            "date"     - convert every numeral found in the fragment;
            "fraction" - turn "A分之B" into "B/A"; fragments starting with
                         "百" are returned untouched (the "percent" mode
                         deals with "百分之…");
            "percent"  - turn "百分之N" into "N%";
            "celsius"  - turn "N攝氏度" into "N℃";
            "number"   - convert the whole fragment as one numeral.

    Returns:
        The rewritten fragment.  Conversion is best-effort: on any failure
        (including an unknown ``sub_mode``) a warning is printed and the
        fragment is returned unchanged.  Empty input yields "".
    """
    if not inputs:
        # Return a real string instead of falling off the end with None, so
        # the declared return type holds for empty matches as well.
        return ""

    def convert(match):
        return str(chinese_to_arabic(match.group()))

    try:
        if sub_mode == "date":
            return re.sub(fr"(({smart_cn_pattern})|({cn_pattern}))",
                          convert, inputs)
        if sub_mode == "fraction":
            if inputs[0] == "百":
                return inputs
            converted = re.sub(cn_pattern, convert, inputs)
            # In "A分之B" the part before "分之" is the denominator.
            denominator, numerator = converted.split("分之")
            return f"{numerator}/{denominator}"
        if sub_mode == "percent":
            converted = re.sub(f"(?<=百分之){cn_pattern}", convert, inputs)
            return converted.replace("百分之", "") + "%"
        if sub_mode == "celsius":
            converted = re.sub(f"{cn_pattern}(?=攝氏度)", convert, inputs)
            return converted.replace("攝氏度", "℃")
        if sub_mode == "number":
            return str(chinese_to_arabic(inputs))
        raise Exception(f"error sub_mode: {sub_mode} !")
    except Exception as e:
        # Deliberate best-effort: report and keep the original text.
        print(f"WARN: {e}")
        return inputs
def chinese_in_string_transform(inputs: str) -> str:
    """Rewrite the Chinese numerals found in a string as Arabic numerals.

    The text is first normalised ("廿" -> "二十", "半" -> "0.5", "两" -> 2)
    and then rewritten by five successive passes, in this order: dates
    (年/月/日), fractions ("分之"), percentages ("百分之"), Celsius
    temperatures ("攝氏度") and finally any remaining plain numeral.

    Args:
        inputs: the text to rewrite.

    Returns:
        The text with the recognised numerals converted.
    """
    inputs = inputs.replace("廿", "二十").replace("半", "0.5")
    # "两" directly in front of a unit character has to stay a Chinese digit:
    # an ASCII "2" would be left out of the numeral match and the unit would
    # then be converted on its own.  Elsewhere it becomes "2" as before.
    inputs = re.sub(f"两(?=[{all_unit}])", "二", inputs).replace("两", "2")
    # date
    # Every component of the pattern is optional, so it also matches the
    # empty string at each position; those matches are passed through as "".
    inputs = re.sub(
        fr"((({smart_cn_pattern})|({cn_pattern}))年)?([{all_num}十]+月)?([{all_num}十]+日)?",
        lambda m: __sub_util(m.group(), "date") if m.group() else "",
        inputs)
    # fraction
    inputs = re.sub(fr"{cn_pattern}分之{cn_pattern}",
                    lambda m: __sub_util(m.group(), "fraction"), inputs)
    # percent
    inputs = re.sub(fr"百分之{cn_pattern}",
                    lambda m: __sub_util(m.group(), "percent"), inputs)
    # celsius
    inputs = re.sub(fr"{cn_pattern}攝氏度",
                    lambda m: __sub_util(m.group(), "celsius"), inputs)
    # number
    output = re.sub(cn_pattern,
                    lambda m: __sub_util(m.group(), "number"), inputs)
    return output
0 commit comments