Skip to content

Commit 363040d

Browse files
committed
core parser
1 parent 132dbca commit 363040d

1 file changed

Lines changed: 158 additions & 0 deletions

File tree

Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
import re
2+
import copy
3+
from shapely.geometry import LineString
4+
5+
# formats = ['%b %d %Y','%b %d, %Y','%b %d, %Y','%B %d, %Y','%B %d %Y','%d/%m/%Y','%d/%m/%y','%b %Y','%B%Y','%b %d,%Y','%d%b%y','%d%b','%b%y','%d %b %y','%m/%d','%m/%d%y']
6+
7+
class DateParser():
    """Locate date-like substrings in free text by exhaustive pattern lookup.

    Each entry of ``formats`` is a strftime-style template built from the
    directives listed in :meth:`get_values` (``%b``, ``%d``, ``%-d``, ``%Y``,
    ``%B``, ``%m``, ``%-m``, ``%y``) plus literal separator characters.  At
    construction time every template is expanded into all concrete lowercase
    strings it can produce; :meth:`parse_string` then looks those strings up
    in the lowercased query text.

    Args:
        formats: Iterable of format templates, e.g. ``['%b %d %Y', '%d/%m/%y']``.
        start_year: First four-digit year generated for ``%Y`` (inclusive).
        end_year: Last four-digit year generated for ``%Y`` (inclusive).
    """

    # Token indices reported when a match overlaps no token at all.  These are
    # the values the original implementation used as loop seeds; they are kept
    # only so that this (normally unreachable) case stays backward compatible.
    _NO_TOKEN_START = 1000
    _NO_TOKEN_END = -1

    def __init__(self, formats, start_year=1900, end_year=2100):
        self.formats = formats
        self.start_year = start_year
        self.end_year = end_year
        # get_patterns() reads self.values, so the order of these two matters.
        self.values = self.get_values()
        self.patterns = self.get_patterns()

    def get_patterns(self):
        """Expand every format into the concrete strings it can match.

        Returns:
            dict mapping each format template to a dict whose keys are the
            expanded strings in sorted order (values are ``None``; the dict is
            used as an ordered set).  A format that contains no known
            directive, or a directive with no candidate values, maps to an
            empty dict.
        """
        final_patterns = {}
        for fmat in self.formats:
            present_sub_formats = [d for d in self.values if d in fmat]
            # Seed with the raw template and substitute one directive at a
            # time.  Seeding explicitly (instead of testing for an empty list)
            # means a directive with no values yields no patterns at all
            # rather than restarting from the template and leaking literal
            # "%X" text into the results.
            patterns = [fmat] if present_sub_formats else []
            for directive in present_sub_formats:
                candidates = self.values[directive]
                patterns = [
                    partial.replace(directive, val)
                    for partial in patterns
                    for val in candidates
                ]
            final_patterns[fmat] = dict.fromkeys(sorted(patterns))
        return final_patterns

    def get_values(self):
        """Return the candidate (lowercase) strings for each supported directive."""
        values = {
            "%b": ["jan", "feb", "mar", "apr", "may", "jun",
                   "jul", "aug", "sep", "oct", "nov", "dec"],
            "%d": self.get_dates(n_digits=2),
            "%-d": self.get_dates(),
            "%Y": self.get_year_value(start_year=self.start_year,
                                      end_year=self.end_year),
            "%B": ["january", "february", "march", "april", "may", "june",
                   "july", "august", "september", "october", "november",
                   "december"],
            "%m": self.get_months(),
            "%-m": self.get_months(n_digits=1),
            # Two-digit years are always 00-99, independent of the year range.
            "%y": self.get_year_value(n_digits=2),
        }
        return values

    def get_year_value(self, start_year=1900, end_year=2100, n_digits=4):
        """Return year strings.

        With ``n_digits == 4`` the result is every year from ``start_year`` to
        ``end_year`` inclusive; for any other ``n_digits`` it is the zero-padded
        two-digit years ``"00"`` .. ``"99"`` and the range arguments are ignored.
        """
        if n_digits == 4:
            return [str(year) for year in range(start_year, end_year + 1)]
        return ["{:02d}".format(year) for year in range(0, 100)]

    def get_dates(self, n_digits=1):
        """Return day-of-month strings 1-31, zero-padded unless ``n_digits == 1``."""
        if n_digits == 1:
            return [str(day) for day in range(1, 32)]
        return ["{:02d}".format(day) for day in range(1, 32)]

    def get_months(self, n_digits=2):
        """Return month-number strings 1-12, zero-padded when ``n_digits == 2``."""
        if n_digits == 2:
            return ["{:02d}".format(month) for month in range(1, 13)]
        return [str(month) for month in range(1, 13)]

    def find_repeat_matches(self, query_string, sub_string, pattern):
        """Find every non-overlapping occurrence of ``sub_string``.

        Args:
            query_string: Text to search.
            sub_string: Literal text to look for.
            pattern: Format template the substring came from; copied into
                each result unchanged.

        Returns:
            List of ``(sub_string, start, end, pattern)`` tuples in
            left-to-right order, where ``end`` is exclusive.  An empty
            ``sub_string`` yields an empty list.
        """
        if not sub_string:
            # "" is "found" at every position; searching for it never ends.
            return []
        width = len(sub_string)
        ret_list = []
        start = query_string.find(sub_string)
        while start != -1:
            ret_list.append((sub_string, start, start + width, pattern))
            # Resume after the hit so occurrences never overlap.
            start = query_string.find(sub_string, start + width)
        return ret_list

    def parse_string(self, query_string):
        """Return the date matches found in ``query_string``.

        The query is lowercased before matching, so reported character offsets
        refer to the lowercased text.

        Returns:
            ``None`` when nothing matches; otherwise the list produced by
            :meth:`get_match_tokens`, whose items have the form
            ``[matched_text, char_start, char_end, format, first_token,
            last_token]``.
        """
        query_string = query_string.lower()
        matches = {}
        for fmat, expanded in self.patterns.items():
            for candidate in expanded:
                hits = self.find_repeat_matches(query_string, candidate, fmat)
                if hits:
                    matches.setdefault(fmat, []).extend(hits)
        priority_matches = self.priority_matches(matches)
        if not priority_matches:
            return None
        token_spans = self.get_token_spans(query_string)
        return self.get_match_tokens(priority_matches, token_spans)

    def get_match_tokens(self, priority_matches, token_spans):
        """Attach token indices to matches and drop matches nested in longer ones.

        Args:
            priority_matches: dict of format -> list of
                ``(text, char_start, char_end, format)`` tuples.
            token_spans: dict as returned by :meth:`get_token_spans`.

        Returns:
            List of ``[text, char_start, char_end, format, first_token,
            last_token]`` lists, longest match first (ties keep their input
            order).  A match whose span lies inside a longer kept match is
            removed; matches with identical spans are all kept.
        """
        ret_list = []
        for key in priority_matches:
            for pm in priority_matches[key]:
                char_start, char_end = pm[1], pm[2]
                # Closed-interval overlap: spans that merely touch at an
                # endpoint count as overlapping.  This mirrors intersecting two
                # collinear line segments, which the original did via shapely.
                touched = [
                    span[3]
                    for span in token_spans.values()
                    if char_start <= span[2] and span[1] <= char_end
                ]
                if touched:
                    start_token, end_token = min(touched), max(touched)
                else:
                    start_token = self._NO_TOKEN_START
                    end_token = self._NO_TOKEN_END
                ret_list.append(list(pm) + [start_token, end_token])

        # Longest first, so any enclosing match is seen before what it encloses.
        ret_list.sort(key=lambda match: match[2] - match[1], reverse=True)

        final_ret_list = []
        for candidate in ret_list:
            start, end = candidate[1], candidate[2]
            enclosed = any(
                (kept[1] <= start and kept[2] > end)
                or (kept[1] < start and kept[2] >= end)
                for kept in final_ret_list
            )
            if not enclosed:
                final_ret_list.append(candidate)
        return final_ret_list

    def priority_matches(self, matches):
        """Keep only formats that are not substrings of a longer matched format.

        Args:
            matches: dict of format -> list of match tuples.

        Returns:
            A dict with the surviving formats (longest first) and their
            original match lists.
        """
        unique_found_formats = []
        for fmat in sorted(matches, key=len, reverse=True):
            if not any(fmat in longer for longer in unique_found_formats):
                unique_found_formats.append(fmat)
        return {fmat: matches[fmat] for fmat in unique_found_formats}

    def get_token_spans(self, query_string):
        """Map whitespace-separated tokens to their character spans.

        Returns:
            dict of ``token_index -> (token, char_start, char_end, token_index)``
            computed on the lowercased query; ``char_end`` is exclusive.
        """
        query_string = query_string.lower()
        ret_obj = {}
        cursor = 0
        for idx, token in enumerate(query_string.split()):
            # Searching from the end of the previous token pins repeated
            # tokens to successive positions without rebuilding the string.
            start = query_string.index(token, cursor)
            cursor = start + len(token)
            ret_obj[idx] = (token, start, cursor, idx)
        return ret_obj
150+
151+
# if __name__ == "__main__":
152+
# dp = DateParser(formats,start_year=2015,end_year=2015)
153+
# query_string = "Dec 01 2015-Dec 31 2015"
154+
# print(dp.parse_string(query_string))
155+
156+
# dp = DateParser(formats,start_year=1900,end_year=2100)
157+
# query_string = "Today is 10/12/16 and tomorrow is 10/12/16."
158+
# print(dp.parse_string(query_string))

0 commit comments

Comments
 (0)