44
55# formats = ['%b %d %Y','%b %d, %Y','%b %d, %Y','%B %d, %Y','%B %d %Y','%d/%m/%Y','%d/%m/%y','%b %Y','%B%Y','%b %d,%Y','%d%b%y','%d%b','%b%y','%d %b %y','%m/%d','%m/%d%y']
66
7+
78class DateParser ():
def __init__(self, formats, start_year=1900, end_year=2100):
    """Store the configuration and precompute the lookup tables.

    Args:
        formats: iterable of strftime-style format strings to recognise.
        start_year: first year passed to the "%Y" value generator.
        end_year: last year (inclusive) passed to the "%Y" value generator.
    """
    self.formats, self.start_year, self.end_year = formats, start_year, end_year
    # Order matters: get_values() reads the year bounds set above, and
    # get_patterns() reads self.values.
    self.values = self.get_values()
    self.patterns = self.get_patterns()
14-
15+
1516 def get_patterns (self ):
1617 final_patterns = {}
1718 for fmat in self .formats :
@@ -21,84 +22,85 @@ def get_patterns(self):
2122 if v in fmat :
2223 present_sub_formats .append (v )
2324 i = 0
24- while (i < len (present_sub_formats )):
25+ while (i < len (present_sub_formats )):
2526 v = present_sub_formats [i ]
2627 if len (patterns ) == 0 :
2728 for val in self .values [v ]:
28- patterns .append (fmat .replace (v ,val ))
29+ patterns .append (fmat .replace (v , val ))
2930 else :
3031 new_patterns = []
3132 for p in patterns :
3233 for val in self .values [v ]:
33- new_patterns .append (p .replace (v ,val ))
34+ new_patterns .append (p .replace (v , val ))
3435 patterns = copy .deepcopy (new_patterns )
3536 i += 1
36- final_patterns [fmat ] = {k :None for k in sorted (patterns )}
37+ final_patterns [fmat ] = {k : None for k in sorted (patterns )}
3738 return final_patterns
38-
39-
39+
def get_values(self):
    """Return every literal value each supported format directive can take.

    Returns:
        dict mapping a directive ("%b", "%d", "%-d", "%Y", "%B", "%m",
        "%-m", "%y") to the list of lowercase strings it may expand to.
    """
    return {
        "%b": ["jan", "feb", "mar", "apr", "may", "jun",
               "jul", "aug", "sep", "oct", "nov", "dec"],
        # Zero-padded day of month.
        "%d": self.get_dates(n_digits=2),
        # Day of month without padding.
        "%-d": self.get_dates(),
        # Four-digit years limited to the configured range.
        "%Y": self.get_year_value(start_year=self.start_year,
                                  end_year=self.end_year),
        "%B": ["january", "february", "march", "april", "may", "june",
               "july", "august", "september", "october", "november",
               "december"],
        # Zero-padded month number.
        "%m": self.get_months(),
        # Month number without padding.
        "%-m": self.get_months(n_digits=1),
        # Two-digit years. The instance must not be passed explicitly here:
        # on a bound method it would be taken as start_year.
        "%y": self.get_year_value(n_digits=2),
    }
5252
def get_year_value(self, start_year=1900, end_year=2100, n_digits=4):
    """List the year strings a year directive can match.

    Args:
        start_year: first four-digit year to include.
        end_year: last four-digit year to include (inclusive).
        n_digits: 4 selects four-digit years; any other value selects
            the two-digit form.

    Returns:
        list of str: the years from start_year to end_year when n_digits
        is 4, otherwise "00" through "99" (the bounds are then ignored).
    """
    if n_digits != 4:
        return [str(short_year).zfill(2) for short_year in range(100)]
    return [str(year) for year in range(start_year, end_year + 1)]
5858
def get_dates(self, n_digits=1):
    """List the day-of-month strings from 1 to 31.

    Args:
        n_digits: 1 yields unpadded days ("1" .. "31"); any other value
            yields two-character zero-padded days ("01" .. "31").

    Returns:
        list of str with 31 entries in ascending order.
    """
    days = range(1, 32)
    if n_digits != 1:
        return [str(day).zfill(2) for day in days]
    return [str(day) for day in days]
6464
def get_months(self, n_digits=2):
    """List the month-number strings from 1 to 12.

    Args:
        n_digits: 2 yields zero-padded months ("01" .. "12"); any other
            value yields unpadded months ("1" .. "12").

    Returns:
        list of str with 12 entries in ascending order.
    """
    months = range(1, 13)
    if n_digits != 2:
        return [str(month) for month in months]
    return [str(month).zfill(2) for month in months]
def find_repeat_matches(self, query_string, sub_string, pattern):
    """Locate every occurrence of sub_string inside query_string.

    Each occurrence found is overwritten with spaces before searching
    again, so the same characters are not reported twice.

    Args:
        query_string: text to search.
        sub_string: literal text to look for.
        pattern: label stored with each hit (parse_string passes the
            format string the literal was generated from).

    Returns:
        list of (sub_string, start, end, pattern) tuples in the order the
        occurrences were found; end is exclusive. The list is empty when
        nothing matches or when sub_string is empty or made only of spaces.
    """
    width = len(sub_string)
    blank = " " * width
    # Overwriting an empty or all-space match with spaces leaves the text
    # unchanged, so the search below would never stop; report no matches.
    if sub_string == blank:
        return []

    remaining = query_string
    found = []
    start = remaining.find(sub_string)
    while start != -1:
        end = start + width
        found.append((sub_string, start, end, pattern))
        # Blank out the hit so the next search moves on to the next one.
        remaining = remaining[:start] + blank + remaining[end:]
        start = remaining.find(sub_string)
    return found
8486
def parse_string(self, query_string):
    """Find the known date literals that occur in query_string.

    The text is lowercased, every precomputed literal in self.patterns is
    looked up in it, and the hits are grouped by the format that produced
    them before being filtered and mapped onto whitespace tokens.

    Args:
        query_string: free text to scan.

    Returns:
        The result of get_match_tokens() for the prioritised matches, or
        None when priority_matches() keeps nothing.
    """
    lowered = query_string.lower()
    matches = {}
    for fmat, candidates in self.patterns.items():
        for candidate in candidates:
            if candidate not in lowered:
                continue
            matches.setdefault(fmat, []).extend(
                self.find_repeat_matches(lowered, candidate, fmat))

    prioritized = self.priority_matches(matches)
    if not prioritized:
        return None
    token_spans = self.get_token_spans(lowered)
    return self.get_match_tokens(prioritized, token_spans)
100102
101- def get_match_tokens (self ,priority_matches ,token_spans ):
103+ def get_match_tokens (self , priority_matches , token_spans ):
102104 ret_list = []
103105 for key in priority_matches :
104106 for pm in priority_matches [key ]:
@@ -107,13 +109,14 @@ def get_match_tokens(self,priority_matches,token_spans):
107109 start_token = 1000
108110 end_token = - 1
109111 for idx in token_spans :
110- ls1 = LineString ([(char_start ,0 ),(char_end ,0 )])
111- ls2 = LineString ([(token_spans [idx ][1 ],0 ),(token_spans [idx ][2 ],0 )])
112+ ls1 = LineString ([(char_start , 0 ), (char_end , 0 )])
113+ ls2 = LineString (
114+ [(token_spans [idx ][1 ], 0 ), (token_spans [idx ][2 ], 0 )])
112115 if ls1 .intersects (ls2 ):
113- start_token = min (start_token ,token_spans [idx ][3 ])
114- end_token = max (end_token ,token_spans [idx ][3 ])
115- ret_list .append (list (pm ) + [start_token ,end_token ])
116- ret_list = sorted (ret_list ,key = lambda x :x [2 ]- x [1 ],reverse = True )
116+ start_token = min (start_token , token_spans [idx ][3 ])
117+ end_token = max (end_token , token_spans [idx ][3 ])
118+ ret_list .append (list (pm ) + [start_token , end_token ])
119+ ret_list = sorted (ret_list , key = lambda x : x [2 ]- x [1 ], reverse = True )
117120 final_ret_list = []
118121 for rl in ret_list :
119122 flag = False
@@ -125,10 +128,11 @@ def get_match_tokens(self,priority_matches,token_spans):
125128 final_ret_list .append (rl )
126129 return final_ret_list
127130
128- def priority_matches (self ,matches ):
131+ def priority_matches (self , matches ):
129132 unique_found_formats = []
130133 found_formats = matches .keys ()
131- found_formats = sorted (found_formats ,key = lambda x :len (x ),reverse = True )
134+ found_formats = sorted (
135+ found_formats , key = lambda x : len (x ), reverse = True )
132136 for f in found_formats :
133137 flag = False
134138 for uf in unique_found_formats :
@@ -137,22 +141,15 @@ def priority_matches(self,matches):
137141 break
138142 if not flag :
139143 unique_found_formats .append (f )
140- return {k :matches [k ] for k in unique_found_formats }
144+ return {k : matches [k ] for k in unique_found_formats }
141145
def get_token_spans(self, query_string):
    """Map each whitespace-separated token to its character span.

    Args:
        query_string: text to tokenise; it is lowercased first, so the
            offsets refer to the lowercased text.

    Returns:
        dict keyed by token position, each value a tuple
        (token, start, end, position) with an exclusive end offset.
    """
    lowered = query_string.lower()
    spans = {}
    cursor = 0
    for position, token in enumerate(lowered.split()):
        # Tokens come back in reading order, so searching from the end of
        # the previous token lands on this token's own occurrence.
        start = lowered.find(token, cursor)
        cursor = start + len(token)
        spans[position] = (token, start, cursor, position)
    return spans
150-
151- # if __name__ == "__main__":
152- # dp = DateParser(formats,start_year=2015,end_year=2015)
153- # query_string = "Dec 01 2015-Dec 31 2015"
154- # print(dp.parse_string(query_string))
155-
156- # dp = DateParser(formats,start_year=1900,end_year=2100)
157- # query_string = "Today is 10/12/16 and tomorrow is 10/12/16."
158- # print(dp.parse_string(query_string))
0 commit comments