1+ import re
2+ import copy
3+ from shapely .geometry import LineString
4+
5+ # formats = ['%b %d %Y','%b %d, %Y','%b %d, %Y','%B %d, %Y','%B %d %Y','%d/%m/%Y','%d/%m/%y','%b %Y','%B%Y','%b %d,%Y','%d%b%y','%d%b','%b%y','%d %b %y','%m/%d','%m/%d%y']
6+
class DateParser():
    """Locate date-like substrings in text by exhaustive literal matching.

    Every format string is expanded into all concrete lower-case literals it
    can stand for (for example ``"%b %d"`` becomes ``"jan 01"``, ``"jan 02"``
    and so on) and those literals are searched for verbatim in the
    lower-cased query string.

    Supported directives are the keys of the mapping built by
    :meth:`get_values`: ``%b``, ``%B``, ``%d``, ``%-d``, ``%m``, ``%-m``,
    ``%Y`` and ``%y``.

    Attributes:
        formats: The format strings handed to the constructor.
        start_year: First four-digit year generated for ``%Y`` (inclusive).
        end_year: Last four-digit year generated for ``%Y`` (inclusive).
        values: Mapping of directive -> list of literal replacement strings.
        patterns: Mapping of format -> dict whose keys are the expanded
            literals of that format (every dict value is ``None``).
    """

    # Token indices reported for a match that overlaps no token at all.
    # Kept for backward compatibility with the historical return values.
    _NO_START_TOKEN = 1000
    _NO_END_TOKEN = -1

    def __init__(self, formats, start_year=1900, end_year=2100):
        """Store the configuration and pre-compute all literal patterns.

        Args:
            formats: Iterable of format strings built from the supported
                directives plus arbitrary literal text.
            start_year: First year (inclusive) expanded for ``%Y``.
            end_year: Last year (inclusive) expanded for ``%Y``.
        """
        self.formats = formats
        self.start_year = start_year
        self.end_year = end_year
        # ``values`` must exist before ``get_patterns`` runs; it reads it.
        self.values = self.get_values()
        self.patterns = self.get_patterns()

    def get_patterns(self):
        """Expand every format into the literal strings it can represent.

        Returns:
            dict mapping each format to a dict whose keys are the expanded
            literals in sorted order (values are all ``None``). A format
            that contains no known directive, or one of whose directives
            has no candidate values, maps to an empty dict.
        """
        final_patterns = {}
        for fmat in self.formats:
            present_sub_formats = [d for d in self.values if d in fmat]
            patterns = []
            for position, directive in enumerate(present_sub_formats):
                # Only the first directive starts from the raw format. Keying
                # this on the position (not on ``patterns`` being empty)
                # prevents a directive with no candidate values from
                # restarting the expansion and leaking unexpanded
                # directives into the result.
                sources = patterns if position else [fmat]
                patterns = [
                    source.replace(directive, val)
                    for source in sources
                    for val in self.values[directive]
                ]
            final_patterns[fmat] = {k: None for k in sorted(patterns)}
        return final_patterns

    def get_values(self):
        """Build the directive -> candidate literal strings mapping.

        Returns:
            dict keyed by directive; each value is a list of strings.
        """
        values = {
            "%b": ["jan", "feb", "mar", "apr", "may", "jun",
                   "jul", "aug", "sep", "oct", "nov", "dec"],
            "%d": self.get_dates(n_digits=2),
            "%-d": self.get_dates(),
            "%Y": self.get_year_value(start_year=self.start_year,
                                      end_year=self.end_year),
            "%B": ["january", "february", "march", "april", "may", "june",
                   "july", "august", "september", "october", "november",
                   "december"],
            "%m": self.get_months(),
            "%-m": self.get_months(n_digits=1),
            # Two-digit years always span 00-99 regardless of the range.
            "%y": self.get_year_value(n_digits=2),
        }
        return values

    def get_year_value(self, start_year=1900, end_year=2100, n_digits=4):
        """Return candidate year strings.

        Args:
            start_year: First year (inclusive) when ``n_digits`` is 4.
            end_year: Last year (inclusive) when ``n_digits`` is 4.
            n_digits: 4 for full years; any other value yields the
                zero-padded two-digit years ``"00"``..``"99"``.

        Returns:
            list of str.
        """
        if n_digits == 4:
            return [str(d) for d in range(start_year, end_year + 1)]
        return ["{:02d}".format(d) for d in range(0, 100)]

    def get_dates(self, n_digits=1):
        """Return day-of-month strings for 1..31.

        Args:
            n_digits: 1 for unpadded (``"1"``..``"31"``); any other value
                for zero-padded (``"01"``..``"31"``).

        Returns:
            list of str.
        """
        if n_digits == 1:
            return [str(d) for d in range(1, 32)]
        return ["{:02d}".format(d) for d in range(1, 32)]

    def get_months(self, n_digits=2):
        """Return month-number strings for 1..12.

        Args:
            n_digits: 2 for zero-padded (``"01"``..``"12"``); any other
                value for unpadded (``"1"``..``"12"``).

        Returns:
            list of str.
        """
        if n_digits == 2:
            return ["{:02d}".format(d) for d in range(1, 13)]
        return [str(d) for d in range(1, 13)]

    def find_repeat_matches(self, query_string, sub_string, pattern):
        """Find every non-overlapping occurrence of ``sub_string``.

        Occurrences are collected left to right; the search resumes right
        after the end of the previous hit.

        Args:
            query_string: Text to search.
            sub_string: Literal to look for. An empty literal yields no
                matches (it would otherwise match forever at index 0).
            pattern: Format the literal was expanded from; copied into
                every result tuple unchanged.

        Returns:
            list of ``(sub_string, start, end, pattern)`` tuples where
            ``end`` is exclusive.
        """
        if not sub_string:
            return []
        ret_list = []
        width = len(sub_string)
        start = query_string.find(sub_string)
        while start != -1:
            ret_list.append((sub_string, start, start + width, pattern))
            start = query_string.find(sub_string, start + width)
        return ret_list

    def parse_string(self, query_string):
        """Find all date literals in ``query_string``.

        The text is lower-cased before matching, so the offsets and matched
        text in the result refer to the lower-cased string.

        Args:
            query_string: Text to scan.

        Returns:
            ``None`` when nothing matched, otherwise the list produced by
            :meth:`get_match_tokens`: entries of the form
            ``[text, char_start, char_end, format, start_token, end_token]``.
        """
        query_string = query_string.lower()
        matches = {}
        for fmat, literals in self.patterns.items():
            for literal in literals:
                hits = self.find_repeat_matches(query_string, literal, fmat)
                if hits:
                    matches.setdefault(fmat, []).extend(hits)
        priority_matches = self.priority_matches(matches)
        if not priority_matches:
            return None
        token_spans = self.get_token_spans(query_string)
        return self.get_match_tokens(priority_matches, token_spans)

    @staticmethod
    def _strictly_covers(outer, inner):
        """Return True when ``outer``'s char span contains ``inner``'s and is larger."""
        return ((outer[1] <= inner[1] and outer[2] > inner[2])
                or (outer[1] < inner[1] and outer[2] >= inner[2]))

    def get_match_tokens(self, priority_matches, token_spans):
        """Attach token indices to matches and drop swallowed matches.

        Args:
            priority_matches: Mapping of format -> list of
                ``(text, char_start, char_end, format)`` tuples.
            token_spans: Mapping as returned by :meth:`get_token_spans`.

        Returns:
            list of ``[text, char_start, char_end, format, start_token,
            end_token]`` ordered by descending match length. A match whose
            span lies inside a strictly larger kept match is removed;
            matches with identical spans are all kept.
        """
        ret_list = []
        for key in priority_matches:
            for pm in priority_matches[key]:
                char_start, char_end = pm[1], pm[2]
                # Closed-interval overlap test on the character axis: spans
                # that merely touch at an endpoint count as overlapping.
                hit_tokens = [
                    span[3]
                    for span in token_spans.values()
                    if char_start <= span[2] and span[1] <= char_end
                ]
                if hit_tokens:
                    start_token = min(hit_tokens)
                    end_token = max(hit_tokens)
                else:
                    start_token = self._NO_START_TOKEN
                    end_token = self._NO_END_TOKEN
                ret_list.append(list(pm) + [start_token, end_token])
        # Longest first, so a larger match is always accepted before any
        # smaller match it contains is examined. The sort is stable.
        ret_list.sort(key=lambda m: m[2] - m[1], reverse=True)
        final_ret_list = []
        for candidate in ret_list:
            if not any(self._strictly_covers(kept, candidate)
                       for kept in final_ret_list):
                final_ret_list.append(candidate)
        return final_ret_list

    def priority_matches(self, matches):
        """Drop formats whose format string is a substring of a kept one.

        Formats are visited from longest to shortest format string. The
        filter works on formats as a whole, so all matches of a dropped
        format are discarded, wherever they occur in the text.

        Args:
            matches: Mapping of format -> list of match tuples.

        Returns:
            dict holding only the surviving formats and their matches.
        """
        unique_found_formats = []
        for fmat in sorted(matches, key=len, reverse=True):
            if not any(fmat in kept for kept in unique_found_formats):
                unique_found_formats.append(fmat)
        return {k: matches[k] for k in unique_found_formats}

    def get_token_spans(self, query_string):
        """Split text on whitespace and record each token's char span.

        The text is lower-cased first; offsets refer to the lower-cased
        string.

        Args:
            query_string: Text to tokenize.

        Returns:
            dict mapping token index -> ``(token, start, end, index)`` with
            ``end`` exclusive.
        """
        query_string = query_string.lower()
        ret_obj = {}
        cursor = 0
        for idx, token in enumerate(query_string.split()):
            # Tokens come out of split() in textual order, so each one is
            # the first occurrence at or after the end of the previous one.
            begin = query_string.find(token, cursor)
            cursor = begin + len(token)
            ret_obj[idx] = (token, begin, cursor, idx)
        return ret_obj
150+
151+ # if __name__ == "__main__":
152+ # dp = DateParser(formats,start_year=2015,end_year=2015)
153+ # query_string = "Dec 01 2015-Dec 31 2015"
154+ # print(dp.parse_string(query_string))
155+
156+ # dp = DateParser(formats,start_year=1900,end_year=2100)
157+ # query_string = "Today is 10/12/16 and tomorrow is 10/12/16."
158+ # print(dp.parse_string(query_string))
0 commit comments