Python dataframe compare rows and get min and max values [closed]
up vote
0
down vote
favorite
I need to do row wise comparison. When certain condition satisfies then I want to perform the group by.
Currently I have the code to do row wise comparison using while loop. I would like to avoid loops and looking for a way at the dataframe level.
The condition for group by: For a MID if the Row1 ToDate is equal to Row2 FromDate or Row1 ToDate plus one day is equal to Row2 Fromdate - then those two rows needs to be concatenated taking the Row1Todate as Fromdate and Row2ToDate as Todate.
Input, Output, steps
>import pandas as pd
>import numpy as np
>import os
>import os.path
>from datetime import datetime, timedelta
>
>dfmfile = pd.DataFrame({'MID':['1000','1000','1001','1001','1002','1002','1003','1003','1004','1004','1005','1006','1007','1007','1007','1007','1008','1008'],'FromDate':['2017-06-12','2017-06-17','2017-04-16','2017-04-18','2017-10-13','2017-10-18','2017-05-21','2017-05-24','2017-02-11','2017-02-11','2017-06-13','2017-03-07','2016-07-05','2017-04-11','2017-04-14','2017-04-18','2015-05-18','2016-05-23'],'ToDate':['2017-06-17','2017-06-22','2017-04-18','2017-04-22','2017-10-17','2017-10-22','2017-05-23','2017-05-27','2017-02-15','2017-02-11','2017-06-18','2017-03-09','2016-07-05','2017-04-13','2017-04-17','2017-04-22','2015-05-22','2016-05-23'],'CH':['1000','1000','1001','1001','1002','1003','1004','1005','1006','1007','1008','1009','1010','1011','1012','1013','1014','1014'],'Ln':['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18']})
>dfmfile.index.name='index'
>dfmfile['CID']= dfmfile['CH']+dfmfile['Ln'].fillna('')
>
>dfmfile['MID'] = dfmfile['MID']
>dfmfile['FromDate'] = pd.to_datetime(dfmfile['FromDate'])
>dfmfile['ToDate'] = pd.to_datetime(dfmfile['ToDate'])
>
>print dfmfile
>dfmsort= dfmfile.sort_values(['MID', 'FromDate','ToDate'], ascending=[True, True, True])
>
>dfmsort['MID'] = dfmsort['MID']
>dfmsort['FromDate'] = pd.to_datetime(dfmsort['FromDate'])
>dfmsort['ToDate'] = pd.to_datetime(dfmsort['ToDate'])
>
>dfmsort3=dfmsort.reset_index()
>print dfmsort
>dfunique = pd.DataFrame(data=None,columns=['MID'])
>dfunique['MID']= dfmsort3['MID'].unique()
>#dfunique['MID'] = pd.DataFrame(dfmsort3['MID'].unique(), dtype='str')
>
>dfc = pd.DataFrame(data=None,columns=['MID','AD','DD','CID'])
>
>
>print dfunique
>for row in dfunique.itertuples():
> dfcm = dfmsort3.loc[dfmsort3['MID'] == row.MID].reset_index()
>
> i = dfcm.index.min()
> minindx = i
> maxindx =dfcm.index.max()
>
> MbrId = row.MIDI want to compare Row1.ToDAte = Row2.FromDate or Row2.FromDate-1. Only when that condition satisfies I want to do the group by by ID. The number of rows to compare could be more than 2,3,4,5,6 or n. So simple group by does not work. I want to avoid iterating through rows.
1.Sort the dataframe. Convert string to Dates
2.For a MID: Compare Row1_ToDate with Row2_FromDate
2.1.Initialize: AD = Row1_ToDate
3.IfRow1.ToDate == Row2_FromDate OR Row1_ToDate+1Day == Row2_FromDate
3.1.DD= Row2_ToDate and concatenate CID+Ln and merge these values from
row1 and Row2
4.if Row1_ToDate != Row2_FromDate AND Row1_ToDate+1Day != Row2_FromDate
4.1.DD= Row1_ToDate
[Input and Output][1]
[1]: https://i.stack.imgur.com/fFbHd.png
`
>import pandas as pd
>import numpy as np
>import os
>import os.path
>from datetime import datetime, timedelta
>
>dfmfile = pd.DataFrame({'MID':
>['1000','1000','1001','1001','1002','1002','1003','1003','1004','1004',
> '1005','1006','1007','1007','1007','1007','1008','1008'], 'FromDate':
>['2017-06-12','2017-06-17','2017-04-16','2017-04-18','2017-10-13',
>'2017-10-18','2017-05-21','2017-05-24','2017-02-11','2017-02-11',
>'2017-06-13','2017-03-07','2016-07-05','2017-04-11','2017-04-14',
>'2017-04-18','2015-05-18','2016-05-23'],'ToDate':['2017-06-17',
>'2017-06-22','2017-04-18','2017-04-22','2017-10-17','2017-10-22',
>'2017-05-23','2017-05-27','2017-02-15','2017-02-11','2017-06-18',
>'2017-03-09','2016-07-05','2017-04-13','2017-04-17','2017-04-22',
>'2015-05-22','2016-05-23'],'CH':
>['1000','1000','1001','1001','1002','1003','1004','1005','1006', '1007',
>'1008','1009','1010','1011','1012','1013','1014','1014'],'Ln':
>['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16',
>'17','18']})
>dfmfile.index.name='index'
>dfmfile['CID']= dfmfile['CH']+dfmfile['Ln'].fillna('')
>
>dfmfile['MID'] = dfmfile['MID']
>dfmfile['FromDate'] = pd.to_datetime(dfmfile['FromDate'])
>dfmfile['ToDate'] = pd.to_datetime(dfmfile['ToDate'])
>
>print dfmfile
>dfmsort= dfmfile.sort_values(['MID', 'FromDate','ToDate'], ascending=
>[True,
>True, True])
>
>dfmsort['MID'] = dfmsort['MID']
>dfmsort['FromDate'] = pd.to_datetime(dfmsort['FromDate'])
>dfmsort['ToDate'] = pd.to_datetime(dfmsort['ToDate'])
>
>dfmsort3=dfmsort.reset_index()
>print dfmsort
>dfunique = pd.DataFrame(data=None,columns=['MID'])
>dfunique['MID']= dfmsort3['MID'].unique()
>
>dfc = pd.DataFrame(data=None,columns=['MID','AD','DD','CID'])
>
>
>print dfunique
>for row in dfunique.itertuples():
> dfcm = dfmsort3.loc[dfmsort3['MID'] == row.MID].reset_index()
>
> i = dfcm.index.min()
> minindx = i
> maxindx =dfcm.index.max()
>
> MbrId = row.MID
> CID=''
> AdmitDate =''
> DischargeDate=''
> while (minindx <= maxindx):
> priorrow = dfcm.loc[minindx]
>
> if minindx <= maxindx:
> if (AdmitDate == ''):
> AdmitDate = priorrow.FromDate
>
> if minindx+1 <= maxindx:
> nextrow=dfcm.loc[minindx+1]
> if ((priorrow.ToDate + timedelta(days=1) == nextrow.FromDate)|
>(priorrow.ToDate == nextrow.FromDate)):
>
> DischargeDate = nextrow.ToDate
> if CID == '':
> if priorrow.CID not in CID:
> CID= priorrow.CID
> if nextrow.CID not in CID:
> CID= nextrow.CID
> if CID != '':
> if priorrow.CID not in CID:
> CID= (CID+';'+priorrow.CID)
> if nextrow.CID not in CID:
> CID= (CID+';'+nextrow.CID)
>
> if(((priorrow.ToDate + timedelta(days=1) != nextrow.FromDate) &
>(priorrow.ToDate != nextrow.FromDate))|(minindx == maxindx)):
>
> DischargeDate = priorrow.ToDate
> if ((CID == '') & (priorrow.CID not in CID)):
> CID= priorrow.CID
> if ((CID != '') & (priorrow.CID not in CID)):
> CID= (CID+';'+priorrow.CID)
>
> dfc=pd.DataFrame(np.array([[MbrId, AdmitDate, DischargeDate,
> CID]]), columns=['MID','AD','DD','CID']).append(dfc, ignore_index=True)
>
> CID=''
> AdmitDate =''
> DischargeDate=''
>
> minindx = minindx + 1
>
>print dfc
>
> CID=''
> AdmitDate =''
> DischargeDate=''
> while (minindx <= maxindx):
> priorrow = dfcm.loc[minindx]
>
> if minindx <= maxindx:
> if (AdmitDate == ''):
> AdmitDate = priorrow.FromDate
>
> if minindx+1 <= maxindx:
> nextrow=dfcm.loc[minindx+1]
> if ((priorrow.ToDate + timedelta(days=1) == nextrow.FromDate)|
>(priorrow.ToDate == nextrow.FromDate)):
>
> DischargeDate = nextrow.ToDate
> if CID == '':
> if priorrow.CID not in CID:
> CID= priorrow.CID
> if nextrow.CID not in CID:
> CID= nextrow.CID
> if CID != '':
> if priorrow.CID not in CID:
> CID= (CID+';'+priorrow.CID)
> if nextrow.CID not in CID:
> CID= (CID+';'+nextrow.CID)
>
> if(((priorrow.ToDate + timedelta(days=1) != nextrow.FromDate) &
>(priorrow.ToDate != nextrow.FromDate))|(minindx == maxindx)):
>
> DischargeDate = priorrow.ToDate
> if ((CID == '') & (priorrow.CID not in CID)):
> CID= priorrow.CID
> if ((CID != '') & (priorrow.CID not in CID)):
> CID= (CID+';'+priorrow.CID)
>
> dfc=pd.DataFrame(np.array([[MbrId, AdmitDate, DischargeDate,
>CID]]), columns=['MID','AD','DD','CID']).append(dfc, ignore_index=True)
>
> CID=''
> AdmitDate =''
> DischargeDate=''
>
> minindx = minindx + 1
>
>print dfc
python pandas
New contributor
closed as off-topic by timgeb, TheIncorrigible1, jpp, Daniel Mesejo, TDG Nov 10 at 15:57
This question appears to be off-topic. The users who voted to close gave this specific reason:
- "Questions seeking debugging help ("why isn't this code working?") must include the desired behavior, a specific problem or error and the shortest code necessary to reproduce it in the question itself. Questions without a clear problem statement are not useful to other readers. See: How to create a Minimal, Complete, and Verifiable example." – timgeb, TheIncorrigible1, jpp, Daniel Mesejo, TDG
If this question can be reworded to fit the rules in the help center, please edit the question.
add a comment |
up vote
0
down vote
favorite
I need to do row wise comparison. When certain condition satisfies then I want to perform the group by.
Currently I have the code to do row wise comparison using while loop. I would like to avoid loops and looking for a way at the dataframe level.
The condition for group by: For a MID if the Row1 ToDate is equal to Row2 FromDate or Row1 ToDate plus one day is equal to Row2 Fromdate - then those two rows needs to be concatenated taking the Row1Todate as Fromdate and Row2ToDate as Todate.
Input, Output, steps
>import pandas as pd
>import numpy as np
>import os
>import os.path
>from datetime import datetime, timedelta
>
>dfmfile = pd.DataFrame({'MID':['1000','1000','1001','1001','1002','1002','1003','1003','1004','1004','1005','1006','1007','1007','1007','1007','1008','1008'],'FromDate':['2017-06-12','2017-06-17','2017-04-16','2017-04-18','2017-10-13','2017-10-18','2017-05-21','2017-05-24','2017-02-11','2017-02-11','2017-06-13','2017-03-07','2016-07-05','2017-04-11','2017-04-14','2017-04-18','2015-05-18','2016-05-23'],'ToDate':['2017-06-17','2017-06-22','2017-04-18','2017-04-22','2017-10-17','2017-10-22','2017-05-23','2017-05-27','2017-02-15','2017-02-11','2017-06-18','2017-03-09','2016-07-05','2017-04-13','2017-04-17','2017-04-22','2015-05-22','2016-05-23'],'CH':['1000','1000','1001','1001','1002','1003','1004','1005','1006','1007','1008','1009','1010','1011','1012','1013','1014','1014'],'Ln':['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18']})
>dfmfile.index.name='index'
>dfmfile['CID']= dfmfile['CH']+dfmfile['Ln'].fillna('')
>
>dfmfile['MID'] = dfmfile['MID']
>dfmfile['FromDate'] = pd.to_datetime(dfmfile['FromDate'])
>dfmfile['ToDate'] = pd.to_datetime(dfmfile['ToDate'])
>
>print dfmfile
>dfmsort= dfmfile.sort_values(['MID', 'FromDate','ToDate'], ascending=[True, True, True])
>
>dfmsort['MID'] = dfmsort['MID']
>dfmsort['FromDate'] = pd.to_datetime(dfmsort['FromDate'])
>dfmsort['ToDate'] = pd.to_datetime(dfmsort['ToDate'])
>
>dfmsort3=dfmsort.reset_index()
>print dfmsort
>dfunique = pd.DataFrame(data=None,columns=['MID'])
>dfunique['MID']= dfmsort3['MID'].unique()
>#dfunique['MID'] = pd.DataFrame(dfmsort3['MID'].unique(), dtype='str')
>
>dfc = pd.DataFrame(data=None,columns=['MID','AD','DD','CID'])
>
>
>print dfunique
>for row in dfunique.itertuples():
> dfcm = dfmsort3.loc[dfmsort3['MID'] == row.MID].reset_index()
>
> i = dfcm.index.min()
> minindx = i
> maxindx =dfcm.index.max()
>
> MbrId = row.MIDI want to compare Row1.ToDAte = Row2.FromDate or Row2.FromDate-1. Only when that condition satisfies I want to do the group by by ID. The number of rows to compare could be more than 2,3,4,5,6 or n. So simple group by does not work. I want to avoid iterating through rows.
1.Sort the dataframe. Convert string to Dates
2.For a MID: Compare Row1_ToDate with Row2_FromDate
2.1.Initialize: AD = Row1_ToDate
3.IfRow1.ToDate == Row2_FromDate OR Row1_ToDate+1Day == Row2_FromDate
3.1.DD= Row2_ToDate and concatenate CID+Ln and merge these values from
row1 and Row2
4.if Row1_ToDate != Row2_FromDate AND Row1_ToDate+1Day != Row2_FromDate
4.1.DD= Row1_ToDate
[Input and Output][1]
[1]: https://i.stack.imgur.com/fFbHd.png
`
>import pandas as pd
>import numpy as np
>import os
>import os.path
>from datetime import datetime, timedelta
>
>dfmfile = pd.DataFrame({'MID':
>['1000','1000','1001','1001','1002','1002','1003','1003','1004','1004',
> '1005','1006','1007','1007','1007','1007','1008','1008'], 'FromDate':
>['2017-06-12','2017-06-17','2017-04-16','2017-04-18','2017-10-13',
>'2017-10-18','2017-05-21','2017-05-24','2017-02-11','2017-02-11',
>'2017-06-13','2017-03-07','2016-07-05','2017-04-11','2017-04-14',
>'2017-04-18','2015-05-18','2016-05-23'],'ToDate':['2017-06-17',
>'2017-06-22','2017-04-18','2017-04-22','2017-10-17','2017-10-22',
>'2017-05-23','2017-05-27','2017-02-15','2017-02-11','2017-06-18',
>'2017-03-09','2016-07-05','2017-04-13','2017-04-17','2017-04-22',
>'2015-05-22','2016-05-23'],'CH':
>['1000','1000','1001','1001','1002','1003','1004','1005','1006', '1007',
>'1008','1009','1010','1011','1012','1013','1014','1014'],'Ln':
>['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16',
>'17','18']})
>dfmfile.index.name='index'
>dfmfile['CID']= dfmfile['CH']+dfmfile['Ln'].fillna('')
>
>dfmfile['MID'] = dfmfile['MID']
>dfmfile['FromDate'] = pd.to_datetime(dfmfile['FromDate'])
>dfmfile['ToDate'] = pd.to_datetime(dfmfile['ToDate'])
>
>print dfmfile
>dfmsort= dfmfile.sort_values(['MID', 'FromDate','ToDate'], ascending=
>[True,
>True, True])
>
>dfmsort['MID'] = dfmsort['MID']
>dfmsort['FromDate'] = pd.to_datetime(dfmsort['FromDate'])
>dfmsort['ToDate'] = pd.to_datetime(dfmsort['ToDate'])
>
>dfmsort3=dfmsort.reset_index()
>print dfmsort
>dfunique = pd.DataFrame(data=None,columns=['MID'])
>dfunique['MID']= dfmsort3['MID'].unique()
>
>dfc = pd.DataFrame(data=None,columns=['MID','AD','DD','CID'])
>
>
>print dfunique
>for row in dfunique.itertuples():
> dfcm = dfmsort3.loc[dfmsort3['MID'] == row.MID].reset_index()
>
> i = dfcm.index.min()
> minindx = i
> maxindx =dfcm.index.max()
>
> MbrId = row.MID
> CID=''
> AdmitDate =''
> DischargeDate=''
> while (minindx <= maxindx):
> priorrow = dfcm.loc[minindx]
>
> if minindx <= maxindx:
> if (AdmitDate == ''):
> AdmitDate = priorrow.FromDate
>
> if minindx+1 <= maxindx:
> nextrow=dfcm.loc[minindx+1]
> if ((priorrow.ToDate + timedelta(days=1) == nextrow.FromDate)|
>(priorrow.ToDate == nextrow.FromDate)):
>
> DischargeDate = nextrow.ToDate
> if CID == '':
> if priorrow.CID not in CID:
> CID= priorrow.CID
> if nextrow.CID not in CID:
> CID= nextrow.CID
> if CID != '':
> if priorrow.CID not in CID:
> CID= (CID+';'+priorrow.CID)
> if nextrow.CID not in CID:
> CID= (CID+';'+nextrow.CID)
>
> if(((priorrow.ToDate + timedelta(days=1) != nextrow.FromDate) &
>(priorrow.ToDate != nextrow.FromDate))|(minindx == maxindx)):
>
> DischargeDate = priorrow.ToDate
> if ((CID == '') & (priorrow.CID not in CID)):
> CID= priorrow.CID
> if ((CID != '') & (priorrow.CID not in CID)):
> CID= (CID+';'+priorrow.CID)
>
> dfc=pd.DataFrame(np.array([[MbrId, AdmitDate, DischargeDate,
> CID]]), columns=['MID','AD','DD','CID']).append(dfc, ignore_index=True)
>
> CID=''
> AdmitDate =''
> DischargeDate=''
>
> minindx = minindx + 1
>
>print dfc
>
> CID=''
> AdmitDate =''
> DischargeDate=''
> while (minindx <= maxindx):
> priorrow = dfcm.loc[minindx]
>
> if minindx <= maxindx:
> if (AdmitDate == ''):
> AdmitDate = priorrow.FromDate
>
> if minindx+1 <= maxindx:
> nextrow=dfcm.loc[minindx+1]
> if ((priorrow.ToDate + timedelta(days=1) == nextrow.FromDate)|
>(priorrow.ToDate == nextrow.FromDate)):
>
> DischargeDate = nextrow.ToDate
> if CID == '':
> if priorrow.CID not in CID:
> CID= priorrow.CID
> if nextrow.CID not in CID:
> CID= nextrow.CID
> if CID != '':
> if priorrow.CID not in CID:
> CID= (CID+';'+priorrow.CID)
> if nextrow.CID not in CID:
> CID= (CID+';'+nextrow.CID)
>
> if(((priorrow.ToDate + timedelta(days=1) != nextrow.FromDate) &
>(priorrow.ToDate != nextrow.FromDate))|(minindx == maxindx)):
>
> DischargeDate = priorrow.ToDate
> if ((CID == '') & (priorrow.CID not in CID)):
> CID= priorrow.CID
> if ((CID != '') & (priorrow.CID not in CID)):
> CID= (CID+';'+priorrow.CID)
>
> dfc=pd.DataFrame(np.array([[MbrId, AdmitDate, DischargeDate,
>CID]]), columns=['MID','AD','DD','CID']).append(dfc, ignore_index=True)
>
> CID=''
> AdmitDate =''
> DischargeDate=''
>
> minindx = minindx + 1
>
>print dfc
python pandas
New contributor
closed as off-topic by timgeb, TheIncorrigible1, jpp, Daniel Mesejo, TDG Nov 10 at 15:57
This question appears to be off-topic. The users who voted to close gave this specific reason:
- "Questions seeking debugging help ("why isn't this code working?") must include the desired behavior, a specific problem or error and the shortest code necessary to reproduce it in the question itself. Questions without a clear problem statement are not useful to other readers. See: How to create a Minimal, Complete, and Verifiable example." – timgeb, TheIncorrigible1, jpp, Daniel Mesejo, TDG
If this question can be reworded to fit the rules in the help center, please edit the question.
1
Are you after:df.groupby('ID', as_index=False).agg({'FromDate': 'min', 'ToDate': 'max'})
?
– Jon Clements♦
Nov 10 at 15:39
I want to compare Row1.ToDAte = Row2.FromDate or Row2.FromDate-1. Only when that condition satisfies I want to do the group by. The number of rows to compare could be more than 2,3,4,5,6 or n. So simple group by does not work. So looking for help
– user10633376
Nov 10 at 16:03
1
Might want to edit your question to include that and maybe elaborate on it a bit as to why the other suggestions offered don't work. You can see there's already confusion over what you're asking. Please be as detailed as possible as to the logic you're after and how it ends up with the results you've specified.
– Jon Clements♦
Nov 10 at 16:04
Question is not clear , you need to make it explicit worth readable to get the better inputs, also would be great what you tried to achieve this so reader can get a clue..
– pygo
Nov 10 at 16:16
add a comment |
up vote
0
down vote
favorite
up vote
0
down vote
favorite
I need to do row wise comparison. When certain condition satisfies then I want to perform the group by.
Currently I have the code to do row wise comparison using while loop. I would like to avoid loops and looking for a way at the dataframe level.
The condition for group by: For a MID if the Row1 ToDate is equal to Row2 FromDate or Row1 ToDate plus one day is equal to Row2 Fromdate - then those two rows needs to be concatenated taking the Row1Todate as Fromdate and Row2ToDate as Todate.
Input, Output, steps
>import pandas as pd
>import numpy as np
>import os
>import os.path
>from datetime import datetime, timedelta
>
>dfmfile = pd.DataFrame({'MID':['1000','1000','1001','1001','1002','1002','1003','1003','1004','1004','1005','1006','1007','1007','1007','1007','1008','1008'],'FromDate':['2017-06-12','2017-06-17','2017-04-16','2017-04-18','2017-10-13','2017-10-18','2017-05-21','2017-05-24','2017-02-11','2017-02-11','2017-06-13','2017-03-07','2016-07-05','2017-04-11','2017-04-14','2017-04-18','2015-05-18','2016-05-23'],'ToDate':['2017-06-17','2017-06-22','2017-04-18','2017-04-22','2017-10-17','2017-10-22','2017-05-23','2017-05-27','2017-02-15','2017-02-11','2017-06-18','2017-03-09','2016-07-05','2017-04-13','2017-04-17','2017-04-22','2015-05-22','2016-05-23'],'CH':['1000','1000','1001','1001','1002','1003','1004','1005','1006','1007','1008','1009','1010','1011','1012','1013','1014','1014'],'Ln':['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18']})
>dfmfile.index.name='index'
>dfmfile['CID']= dfmfile['CH']+dfmfile['Ln'].fillna('')
>
>dfmfile['MID'] = dfmfile['MID']
>dfmfile['FromDate'] = pd.to_datetime(dfmfile['FromDate'])
>dfmfile['ToDate'] = pd.to_datetime(dfmfile['ToDate'])
>
>print dfmfile
>dfmsort= dfmfile.sort_values(['MID', 'FromDate','ToDate'], ascending=[True, True, True])
>
>dfmsort['MID'] = dfmsort['MID']
>dfmsort['FromDate'] = pd.to_datetime(dfmsort['FromDate'])
>dfmsort['ToDate'] = pd.to_datetime(dfmsort['ToDate'])
>
>dfmsort3=dfmsort.reset_index()
>print dfmsort
>dfunique = pd.DataFrame(data=None,columns=['MID'])
>dfunique['MID']= dfmsort3['MID'].unique()
>#dfunique['MID'] = pd.DataFrame(dfmsort3['MID'].unique(), dtype='str')
>
>dfc = pd.DataFrame(data=None,columns=['MID','AD','DD','CID'])
>
>
>print dfunique
>for row in dfunique.itertuples():
> dfcm = dfmsort3.loc[dfmsort3['MID'] == row.MID].reset_index()
>
> i = dfcm.index.min()
> minindx = i
> maxindx =dfcm.index.max()
>
> MbrId = row.MIDI want to compare Row1.ToDAte = Row2.FromDate or Row2.FromDate-1. Only when that condition satisfies I want to do the group by by ID. The number of rows to compare could be more than 2,3,4,5,6 or n. So simple group by does not work. I want to avoid iterating through rows.
1.Sort the dataframe. Convert string to Dates
2.For a MID: Compare Row1_ToDate with Row2_FromDate
2.1.Initialize: AD = Row1_ToDate
3.IfRow1.ToDate == Row2_FromDate OR Row1_ToDate+1Day == Row2_FromDate
3.1.DD= Row2_ToDate and concatenate CID+Ln and merge these values from
row1 and Row2
4.if Row1_ToDate != Row2_FromDate AND Row1_ToDate+1Day != Row2_FromDate
4.1.DD= Row1_ToDate
[Input and Output][1]
[1]: https://i.stack.imgur.com/fFbHd.png
`
>import pandas as pd
>import numpy as np
>import os
>import os.path
>from datetime import datetime, timedelta
>
>dfmfile = pd.DataFrame({'MID':
>['1000','1000','1001','1001','1002','1002','1003','1003','1004','1004',
> '1005','1006','1007','1007','1007','1007','1008','1008'], 'FromDate':
>['2017-06-12','2017-06-17','2017-04-16','2017-04-18','2017-10-13',
>'2017-10-18','2017-05-21','2017-05-24','2017-02-11','2017-02-11',
>'2017-06-13','2017-03-07','2016-07-05','2017-04-11','2017-04-14',
>'2017-04-18','2015-05-18','2016-05-23'],'ToDate':['2017-06-17',
>'2017-06-22','2017-04-18','2017-04-22','2017-10-17','2017-10-22',
>'2017-05-23','2017-05-27','2017-02-15','2017-02-11','2017-06-18',
>'2017-03-09','2016-07-05','2017-04-13','2017-04-17','2017-04-22',
>'2015-05-22','2016-05-23'],'CH':
>['1000','1000','1001','1001','1002','1003','1004','1005','1006', '1007',
>'1008','1009','1010','1011','1012','1013','1014','1014'],'Ln':
>['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16',
>'17','18']})
>dfmfile.index.name='index'
>dfmfile['CID']= dfmfile['CH']+dfmfile['Ln'].fillna('')
>
>dfmfile['MID'] = dfmfile['MID']
>dfmfile['FromDate'] = pd.to_datetime(dfmfile['FromDate'])
>dfmfile['ToDate'] = pd.to_datetime(dfmfile['ToDate'])
>
>print dfmfile
>dfmsort= dfmfile.sort_values(['MID', 'FromDate','ToDate'], ascending=
>[True,
>True, True])
>
>dfmsort['MID'] = dfmsort['MID']
>dfmsort['FromDate'] = pd.to_datetime(dfmsort['FromDate'])
>dfmsort['ToDate'] = pd.to_datetime(dfmsort['ToDate'])
>
>dfmsort3=dfmsort.reset_index()
>print dfmsort
>dfunique = pd.DataFrame(data=None,columns=['MID'])
>dfunique['MID']= dfmsort3['MID'].unique()
>
>dfc = pd.DataFrame(data=None,columns=['MID','AD','DD','CID'])
>
>
>print dfunique
>for row in dfunique.itertuples():
> dfcm = dfmsort3.loc[dfmsort3['MID'] == row.MID].reset_index()
>
> i = dfcm.index.min()
> minindx = i
> maxindx =dfcm.index.max()
>
> MbrId = row.MID
> CID=''
> AdmitDate =''
> DischargeDate=''
> while (minindx <= maxindx):
> priorrow = dfcm.loc[minindx]
>
> if minindx <= maxindx:
> if (AdmitDate == ''):
> AdmitDate = priorrow.FromDate
>
> if minindx+1 <= maxindx:
> nextrow=dfcm.loc[minindx+1]
> if ((priorrow.ToDate + timedelta(days=1) == nextrow.FromDate)|
>(priorrow.ToDate == nextrow.FromDate)):
>
> DischargeDate = nextrow.ToDate
> if CID == '':
> if priorrow.CID not in CID:
> CID= priorrow.CID
> if nextrow.CID not in CID:
> CID= nextrow.CID
> if CID != '':
> if priorrow.CID not in CID:
> CID= (CID+';'+priorrow.CID)
> if nextrow.CID not in CID:
> CID= (CID+';'+nextrow.CID)
>
> if(((priorrow.ToDate + timedelta(days=1) != nextrow.FromDate) &
>(priorrow.ToDate != nextrow.FromDate))|(minindx == maxindx)):
>
> DischargeDate = priorrow.ToDate
> if ((CID == '') & (priorrow.CID not in CID)):
> CID= priorrow.CID
> if ((CID != '') & (priorrow.CID not in CID)):
> CID= (CID+';'+priorrow.CID)
>
> dfc=pd.DataFrame(np.array([[MbrId, AdmitDate, DischargeDate,
> CID]]), columns=['MID','AD','DD','CID']).append(dfc, ignore_index=True)
>
> CID=''
> AdmitDate =''
> DischargeDate=''
>
> minindx = minindx + 1
>
>print dfc
>
> CID=''
> AdmitDate =''
> DischargeDate=''
> while (minindx <= maxindx):
> priorrow = dfcm.loc[minindx]
>
> if minindx <= maxindx:
> if (AdmitDate == ''):
> AdmitDate = priorrow.FromDate
>
> if minindx+1 <= maxindx:
> nextrow=dfcm.loc[minindx+1]
> if ((priorrow.ToDate + timedelta(days=1) == nextrow.FromDate)|
>(priorrow.ToDate == nextrow.FromDate)):
>
> DischargeDate = nextrow.ToDate
> if CID == '':
> if priorrow.CID not in CID:
> CID= priorrow.CID
> if nextrow.CID not in CID:
> CID= nextrow.CID
> if CID != '':
> if priorrow.CID not in CID:
> CID= (CID+';'+priorrow.CID)
> if nextrow.CID not in CID:
> CID= (CID+';'+nextrow.CID)
>
> if(((priorrow.ToDate + timedelta(days=1) != nextrow.FromDate) &
>(priorrow.ToDate != nextrow.FromDate))|(minindx == maxindx)):
>
> DischargeDate = priorrow.ToDate
> if ((CID == '') & (priorrow.CID not in CID)):
> CID= priorrow.CID
> if ((CID != '') & (priorrow.CID not in CID)):
> CID= (CID+';'+priorrow.CID)
>
> dfc=pd.DataFrame(np.array([[MbrId, AdmitDate, DischargeDate,
>CID]]), columns=['MID','AD','DD','CID']).append(dfc, ignore_index=True)
>
> CID=''
> AdmitDate =''
> DischargeDate=''
>
> minindx = minindx + 1
>
>print dfc
python pandas
New contributor
I need to do row wise comparison. When certain condition satisfies then I want to perform the group by.
Currently I have the code to do row wise comparison using while loop. I would like to avoid loops and looking for a way at the dataframe level.
The condition for group by: For a MID if the Row1 ToDate is equal to Row2 FromDate or Row1 ToDate plus one day is equal to Row2 Fromdate - then those two rows needs to be concatenated taking the Row1Todate as Fromdate and Row2ToDate as Todate.
Input, Output, steps
>import pandas as pd
>import numpy as np
>import os
>import os.path
>from datetime import datetime, timedelta
>
>dfmfile = pd.DataFrame({'MID':['1000','1000','1001','1001','1002','1002','1003','1003','1004','1004','1005','1006','1007','1007','1007','1007','1008','1008'],'FromDate':['2017-06-12','2017-06-17','2017-04-16','2017-04-18','2017-10-13','2017-10-18','2017-05-21','2017-05-24','2017-02-11','2017-02-11','2017-06-13','2017-03-07','2016-07-05','2017-04-11','2017-04-14','2017-04-18','2015-05-18','2016-05-23'],'ToDate':['2017-06-17','2017-06-22','2017-04-18','2017-04-22','2017-10-17','2017-10-22','2017-05-23','2017-05-27','2017-02-15','2017-02-11','2017-06-18','2017-03-09','2016-07-05','2017-04-13','2017-04-17','2017-04-22','2015-05-22','2016-05-23'],'CH':['1000','1000','1001','1001','1002','1003','1004','1005','1006','1007','1008','1009','1010','1011','1012','1013','1014','1014'],'Ln':['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18']})
>dfmfile.index.name='index'
>dfmfile['CID']= dfmfile['CH']+dfmfile['Ln'].fillna('')
>
>dfmfile['MID'] = dfmfile['MID']
>dfmfile['FromDate'] = pd.to_datetime(dfmfile['FromDate'])
>dfmfile['ToDate'] = pd.to_datetime(dfmfile['ToDate'])
>
>print dfmfile
>dfmsort= dfmfile.sort_values(['MID', 'FromDate','ToDate'], ascending=[True, True, True])
>
>dfmsort['MID'] = dfmsort['MID']
>dfmsort['FromDate'] = pd.to_datetime(dfmsort['FromDate'])
>dfmsort['ToDate'] = pd.to_datetime(dfmsort['ToDate'])
>
>dfmsort3=dfmsort.reset_index()
>print dfmsort
>dfunique = pd.DataFrame(data=None,columns=['MID'])
>dfunique['MID']= dfmsort3['MID'].unique()
>#dfunique['MID'] = pd.DataFrame(dfmsort3['MID'].unique(), dtype='str')
>
>dfc = pd.DataFrame(data=None,columns=['MID','AD','DD','CID'])
>
>
>print dfunique
>for row in dfunique.itertuples():
> dfcm = dfmsort3.loc[dfmsort3['MID'] == row.MID].reset_index()
>
> i = dfcm.index.min()
> minindx = i
> maxindx =dfcm.index.max()
>
> MbrId = row.MIDI want to compare Row1.ToDAte = Row2.FromDate or Row2.FromDate-1. Only when that condition satisfies I want to do the group by by ID. The number of rows to compare could be more than 2,3,4,5,6 or n. So simple group by does not work. I want to avoid iterating through rows.
1.Sort the dataframe. Convert string to Dates
2.For a MID: Compare Row1_ToDate with Row2_FromDate
2.1.Initialize: AD = Row1_ToDate
3.IfRow1.ToDate == Row2_FromDate OR Row1_ToDate+1Day == Row2_FromDate
3.1.DD= Row2_ToDate and concatenate CID+Ln and merge these values from
row1 and Row2
4.if Row1_ToDate != Row2_FromDate AND Row1_ToDate+1Day != Row2_FromDate
4.1.DD= Row1_ToDate
[Input and Output][1]
[1]: https://i.stack.imgur.com/fFbHd.png
`
>import pandas as pd
>import numpy as np
>import os
>import os.path
>from datetime import datetime, timedelta
>
>dfmfile = pd.DataFrame({'MID':
>['1000','1000','1001','1001','1002','1002','1003','1003','1004','1004',
> '1005','1006','1007','1007','1007','1007','1008','1008'], 'FromDate':
>['2017-06-12','2017-06-17','2017-04-16','2017-04-18','2017-10-13',
>'2017-10-18','2017-05-21','2017-05-24','2017-02-11','2017-02-11',
>'2017-06-13','2017-03-07','2016-07-05','2017-04-11','2017-04-14',
>'2017-04-18','2015-05-18','2016-05-23'],'ToDate':['2017-06-17',
>'2017-06-22','2017-04-18','2017-04-22','2017-10-17','2017-10-22',
>'2017-05-23','2017-05-27','2017-02-15','2017-02-11','2017-06-18',
>'2017-03-09','2016-07-05','2017-04-13','2017-04-17','2017-04-22',
>'2015-05-22','2016-05-23'],'CH':
>['1000','1000','1001','1001','1002','1003','1004','1005','1006', '1007',
>'1008','1009','1010','1011','1012','1013','1014','1014'],'Ln':
>['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16',
>'17','18']})
>dfmfile.index.name='index'
>dfmfile['CID']= dfmfile['CH']+dfmfile['Ln'].fillna('')
>
>dfmfile['MID'] = dfmfile['MID']
>dfmfile['FromDate'] = pd.to_datetime(dfmfile['FromDate'])
>dfmfile['ToDate'] = pd.to_datetime(dfmfile['ToDate'])
>
>print dfmfile
>dfmsort= dfmfile.sort_values(['MID', 'FromDate','ToDate'], ascending=
>[True,
>True, True])
>
>dfmsort['MID'] = dfmsort['MID']
>dfmsort['FromDate'] = pd.to_datetime(dfmsort['FromDate'])
>dfmsort['ToDate'] = pd.to_datetime(dfmsort['ToDate'])
>
>dfmsort3=dfmsort.reset_index()
>print dfmsort
>dfunique = pd.DataFrame(data=None,columns=['MID'])
>dfunique['MID']= dfmsort3['MID'].unique()
>
>dfc = pd.DataFrame(data=None,columns=['MID','AD','DD','CID'])
>
>
>print dfunique
>for row in dfunique.itertuples():
> dfcm = dfmsort3.loc[dfmsort3['MID'] == row.MID].reset_index()
>
> i = dfcm.index.min()
> minindx = i
> maxindx =dfcm.index.max()
>
> MbrId = row.MID
> CID=''
> AdmitDate =''
> DischargeDate=''
> while (minindx <= maxindx):
> priorrow = dfcm.loc[minindx]
>
> if minindx <= maxindx:
> if (AdmitDate == ''):
> AdmitDate = priorrow.FromDate
>
> if minindx+1 <= maxindx:
> nextrow=dfcm.loc[minindx+1]
> if ((priorrow.ToDate + timedelta(days=1) == nextrow.FromDate)|
>(priorrow.ToDate == nextrow.FromDate)):
>
> DischargeDate = nextrow.ToDate
> if CID == '':
> if priorrow.CID not in CID:
> CID= priorrow.CID
> if nextrow.CID not in CID:
> CID= nextrow.CID
> if CID != '':
> if priorrow.CID not in CID:
> CID= (CID+';'+priorrow.CID)
> if nextrow.CID not in CID:
> CID= (CID+';'+nextrow.CID)
>
> if(((priorrow.ToDate + timedelta(days=1) != nextrow.FromDate) &
>(priorrow.ToDate != nextrow.FromDate))|(minindx == maxindx)):
>
> DischargeDate = priorrow.ToDate
> if ((CID == '') & (priorrow.CID not in CID)):
> CID= priorrow.CID
> if ((CID != '') & (priorrow.CID not in CID)):
> CID= (CID+';'+priorrow.CID)
>
> dfc=pd.DataFrame(np.array([[MbrId, AdmitDate, DischargeDate,
> CID]]), columns=['MID','AD','DD','CID']).append(dfc, ignore_index=True)
>
> CID=''
> AdmitDate =''
> DischargeDate=''
>
> minindx = minindx + 1
>
>print dfc
>
> CID=''
> AdmitDate =''
> DischargeDate=''
> while (minindx <= maxindx):
> priorrow = dfcm.loc[minindx]
>
> if minindx <= maxindx:
> if (AdmitDate == ''):
> AdmitDate = priorrow.FromDate
>
> if minindx+1 <= maxindx:
> nextrow=dfcm.loc[minindx+1]
> if ((priorrow.ToDate + timedelta(days=1) == nextrow.FromDate)|
>(priorrow.ToDate == nextrow.FromDate)):
>
> DischargeDate = nextrow.ToDate
> if CID == '':
> if priorrow.CID not in CID:
> CID= priorrow.CID
> if nextrow.CID not in CID:
> CID= nextrow.CID
> if CID != '':
> if priorrow.CID not in CID:
> CID= (CID+';'+priorrow.CID)
> if nextrow.CID not in CID:
> CID= (CID+';'+nextrow.CID)
>
> if(((priorrow.ToDate + timedelta(days=1) != nextrow.FromDate) &
>(priorrow.ToDate != nextrow.FromDate))|(minindx == maxindx)):
>
> DischargeDate = priorrow.ToDate
> if ((CID == '') & (priorrow.CID not in CID)):
> CID= priorrow.CID
> if ((CID != '') & (priorrow.CID not in CID)):
> CID= (CID+';'+priorrow.CID)
>
> dfc=pd.DataFrame(np.array([[MbrId, AdmitDate, DischargeDate,
>CID]]), columns=['MID','AD','DD','CID']).append(dfc, ignore_index=True)
>
> CID=''
> AdmitDate =''
> DischargeDate=''
>
> minindx = minindx + 1
>
>print dfc
python pandas
python pandas
New contributor
New contributor
edited Nov 11 at 21:16
New contributor
asked Nov 10 at 15:34
user10633376
43
43
New contributor
New contributor
closed as off-topic by timgeb, TheIncorrigible1, jpp, Daniel Mesejo, TDG Nov 10 at 15:57
This question appears to be off-topic. The users who voted to close gave this specific reason:
- "Questions seeking debugging help ("why isn't this code working?") must include the desired behavior, a specific problem or error and the shortest code necessary to reproduce it in the question itself. Questions without a clear problem statement are not useful to other readers. See: How to create a Minimal, Complete, and Verifiable example." – timgeb, TheIncorrigible1, jpp, Daniel Mesejo, TDG
If this question can be reworded to fit the rules in the help center, please edit the question.
closed as off-topic by timgeb, TheIncorrigible1, jpp, Daniel Mesejo, TDG Nov 10 at 15:57
This question appears to be off-topic. The users who voted to close gave this specific reason:
- "Questions seeking debugging help ("why isn't this code working?") must include the desired behavior, a specific problem or error and the shortest code necessary to reproduce it in the question itself. Questions without a clear problem statement are not useful to other readers. See: How to create a Minimal, Complete, and Verifiable example." – timgeb, TheIncorrigible1, jpp, Daniel Mesejo, TDG
If this question can be reworded to fit the rules in the help center, please edit the question.
1
Are you after:df.groupby('ID', as_index=False).agg({'FromDate': 'min', 'ToDate': 'max'})
?
– Jon Clements♦
Nov 10 at 15:39
I want to compare Row1.ToDAte = Row2.FromDate or Row2.FromDate-1. Only when that condition satisfies I want to do the group by. The number of rows to compare could be more than 2,3,4,5,6 or n. So simple group by does not work. So looking for help
– user10633376
Nov 10 at 16:03
1
Might want to edit your question to include that and maybe elaborate on it a bit as to why the other suggestions offered don't work. You can see there's already confusion over what you're asking. Please be as detailed as possible as to the logic you're after and how it ends up with the results you've specified.
– Jon Clements♦
Nov 10 at 16:04
Question is not clear , you need to make it explicit worth readable to get the better inputs, also would be great what you tried to achieve this so reader can get a clue..
– pygo
Nov 10 at 16:16
add a comment |
1
Are you after:df.groupby('ID', as_index=False).agg({'FromDate': 'min', 'ToDate': 'max'})
?
– Jon Clements♦
Nov 10 at 15:39
I want to compare Row1.ToDAte = Row2.FromDate or Row2.FromDate-1. Only when that condition satisfies I want to do the group by. The number of rows to compare could be more than 2,3,4,5,6 or n. So simple group by does not work. So looking for help
– user10633376
Nov 10 at 16:03
1
Might want to edit your question to include that and maybe elaborate on it a bit as to why the other suggestions offered don't work. You can see there's already confusion over what you're asking. Please be as detailed as possible as to the logic you're after and how it ends up with the results you've specified.
– Jon Clements♦
Nov 10 at 16:04
Question is not clear , you need to make it explicit worth readable to get the better inputs, also would be great what you tried to achieve this so reader can get a clue..
– pygo
Nov 10 at 16:16
1
1
Are you after:
df.groupby('ID', as_index=False).agg({'FromDate': 'min', 'ToDate': 'max'})
?– Jon Clements♦
Nov 10 at 15:39
Are you after:
df.groupby('ID', as_index=False).agg({'FromDate': 'min', 'ToDate': 'max'})
?– Jon Clements♦
Nov 10 at 15:39
I want to compare Row1.ToDAte = Row2.FromDate or Row2.FromDate-1. Only when that condition satisfies I want to do the group by. The number of rows to compare could be more than 2,3,4,5,6 or n. So simple group by does not work. So looking for help
– user10633376
Nov 10 at 16:03
I want to compare Row1.ToDAte = Row2.FromDate or Row2.FromDate-1. Only when that condition satisfies I want to do the group by. The number of rows to compare could be more than 2,3,4,5,6 or n. So simple group by does not work. So looking for help
– user10633376
Nov 10 at 16:03
1
1
Might want to edit your question to include that and maybe elaborate on it a bit as to why the other suggestions offered don't work. You can see there's already confusion over what you're asking. Please be as detailed as possible as to the logic you're after and how it ends up with the results you've specified.
– Jon Clements♦
Nov 10 at 16:04
Might want to edit your question to include that and maybe elaborate on it a bit as to why the other suggestions offered don't work. You can see there's already confusion over what you're asking. Please be as detailed as possible as to the logic you're after and how it ends up with the results you've specified.
– Jon Clements♦
Nov 10 at 16:04
Question is not clear , you need to make it explicit worth readable to get the better inputs, also would be great what you tried to achieve this so reader can get a clue..
– pygo
Nov 10 at 16:16
Question is not clear , you need to make it explicit worth readable to get the better inputs, also would be great what you tried to achieve this so reader can get a clue..
– pygo
Nov 10 at 16:16
add a comment |
2 Answers
2
active
oldest
votes
up vote
0
down vote
First of all you probably want to set the ID
column to be the index:
df.set_index('ID', inplace = True)
Then you can convert your columns to some time format (e.g. datetime
), you cannot subtract strings from each other:
df['FromDate'] = pd.to_datetime(df['FromDate'])
df['ToDate'] = pd.to_datetime(df['ToDate'])
After doing this the pandas.Series.shift
function will do the job:
df['diff'] = df['FromDate'] - df['ToDate'].shift()
Now you have the differences in the diff column.
New contributor
Thanks for the reply, I have tried that and it works only if I have to compare 2 rows. But if you see the id 95007 it has more than 2 records. I want to combine 3 or 4 or n records as long as it satisfies the todate +1 from previous row matches with FromDate on the next row. That is why simple group by with shift and shift+groupby+cumsum does not work for me
– user10633376
Nov 10 at 15:55
add a comment |
up vote
0
down vote
How about as @Jon mentioned.
>>> df.groupby('ID', as_index=False).agg({'FromDate': 'min', 'ToDate': 'max'})
ID FromDate ToDate
0 95000 2017-06-12 2017-06-22
1 95001 2017-04-16 2017-04-22
2 95002 2017-10-13 2017-10-22
3 95003 2017-05-21 2017-05-27
4 95004 2017-02-11 2017-02-15
5 95005 2017-06-13 2017-06-18
6 95006 2017-03-07 2017-03-09
7 95007 2016-07-05 2017-04-22
8 95008 2015-05-18 2016-05-23
For 950007 I expect 2 records : first record 7/5 to 7/5 and second one 4/11 to 4/22
– user10633376
Nov 10 at 16:32
add a comment |
2 Answers
2
active
oldest
votes
2 Answers
2
active
oldest
votes
active
oldest
votes
active
oldest
votes
up vote
0
down vote
First of all you probably want to set the ID
column to be the index:
df.set_index('ID', inplace = True)
Then you can convert your columns to some time format (e.g. datetime
), you cannot subtract strings from each other:
df['FromDate'] = pd.to_datetime(df['FromDate'])
df['ToDate'] = pd.to_datetime(df['ToDate'])
After doing this the pandas.Series.shift
function will do the job:
df['diff'] = df['FromDate'] - df['ToDate'].shift()
Now you have the differences in the diff column.
New contributor
Thanks for the reply, I have tried that and it works only if I have to compare 2 rows. But if you see the id 95007 it has more than 2 records. I want to combine 3 or 4 or n records as long as it satisfies the todate +1 from previous row matches with FromDate on the next row. That is why simple group by with shift and shift+groupby+cumsum does not work for me
– user10633376
Nov 10 at 15:55
add a comment |
up vote
0
down vote
First of all you probably want to set the ID
column to be the index:
df.set_index('ID', inplace = True)
Then you can convert your columns to some time format (e.g. datetime
), you cannot subtract strings from each other:
df['FromDate'] = pd.to_datetime(df['FromDate'])
df['ToDate'] = pd.to_datetime(df['ToDate'])
After doing this the pandas.Series.shift
function will do the job:
df['diff'] = df['FromDate'] - df['ToDate'].shift()
Now you have the differences in the diff column.
New contributor
Thanks for the reply, I have tried that and it works only if I have to compare 2 rows. But if you see the id 95007 it has more than 2 records. I want to combine 3 or 4 or n records as long as it satisfies the todate +1 from previous row matches with FromDate on the next row. That is why simple group by with shift and shift+groupby+cumsum does not work for me
– user10633376
Nov 10 at 15:55
add a comment |
up vote
0
down vote
up vote
0
down vote
First of all you probably want to set the ID
column to be the index:
df.set_index('ID', inplace = True)
Then you can convert your columns to some time format (e.g. datetime
), you cannot subtract strings from each other:
df['FromDate'] = pd.to_datetime(df['FromDate'])
df['ToDate'] = pd.to_datetime(df['ToDate'])
After doing this the pandas.Series.shift
function will do the job:
df['diff'] = df['FromDate'] - df['ToDate'].shift()
Now you have the differences in the diff column.
New contributor
First of all you probably want to set the ID
column to be the index:
df.set_index('ID', inplace = True)
Then you can convert your columns to some time format (e.g. datetime
), you cannot subtract strings from each other:
df['FromDate'] = pd.to_datetime(df['FromDate'])
df['ToDate'] = pd.to_datetime(df['ToDate'])
After doing this the pandas.Series.shift
function will do the job:
df['diff'] = df['FromDate'] - df['ToDate'].shift()
Now you have the differences in the diff column.
New contributor
New contributor
answered Nov 10 at 15:49
zsomko
212
212
New contributor
New contributor
Thanks for the reply, I have tried that and it works only if I have to compare 2 rows. But if you see the id 95007 it has more than 2 records. I want to combine 3 or 4 or n records as long as it satisfies the todate +1 from previous row matches with FromDate on the next row. That is why simple group by with shift and shift+groupby+cumsum does not work for me
– user10633376
Nov 10 at 15:55
add a comment |
Thanks for the reply, I have tried that and it works only if I have to compare 2 rows. But if you see the id 95007 it has more than 2 records. I want to combine 3 or 4 or n records as long as it satisfies the todate +1 from previous row matches with FromDate on the next row. That is why simple group by with shift and shift+groupby+cumsum does not work for me
– user10633376
Nov 10 at 15:55
Thanks for the reply, I have tried that and it works only if I have to compare 2 rows. But if you see the id 95007 it has more than 2 records. I want to combine 3 or 4 or n records as long as it satisfies the todate +1 from previous row matches with FromDate on the next row. That is why simple group by with shift and shift+groupby+cumsum does not work for me
– user10633376
Nov 10 at 15:55
Thanks for the reply, I have tried that and it works only if I have to compare 2 rows. But if you see the id 95007 it has more than 2 records. I want to combine 3 or 4 or n records as long as it satisfies the todate +1 from previous row matches with FromDate on the next row. That is why simple group by with shift and shift+groupby+cumsum does not work for me
– user10633376
Nov 10 at 15:55
add a comment |
up vote
0
down vote
How about as @Jon mentioned.
>>> df.groupby('ID', as_index=False).agg({'FromDate': 'min', 'ToDate': 'max'})
ID FromDate ToDate
0 95000 2017-06-12 2017-06-22
1 95001 2017-04-16 2017-04-22
2 95002 2017-10-13 2017-10-22
3 95003 2017-05-21 2017-05-27
4 95004 2017-02-11 2017-02-15
5 95005 2017-06-13 2017-06-18
6 95006 2017-03-07 2017-03-09
7 95007 2016-07-05 2017-04-22
8 95008 2015-05-18 2016-05-23
For 950007 I expect 2 records : first record 7/5 to 7/5 and second one 4/11 to 4/22
– user10633376
Nov 10 at 16:32
add a comment |
up vote
0
down vote
How about as @Jon mentioned.
>>> df.groupby('ID', as_index=False).agg({'FromDate': 'min', 'ToDate': 'max'})
ID FromDate ToDate
0 95000 2017-06-12 2017-06-22
1 95001 2017-04-16 2017-04-22
2 95002 2017-10-13 2017-10-22
3 95003 2017-05-21 2017-05-27
4 95004 2017-02-11 2017-02-15
5 95005 2017-06-13 2017-06-18
6 95006 2017-03-07 2017-03-09
7 95007 2016-07-05 2017-04-22
8 95008 2015-05-18 2016-05-23
For 950007 I expect 2 records : first record 7/5 to 7/5 and second one 4/11 to 4/22
– user10633376
Nov 10 at 16:32
add a comment |
up vote
0
down vote
up vote
0
down vote
How about as @Jon mentioned.
>>> df.groupby('ID', as_index=False).agg({'FromDate': 'min', 'ToDate': 'max'})
ID FromDate ToDate
0 95000 2017-06-12 2017-06-22
1 95001 2017-04-16 2017-04-22
2 95002 2017-10-13 2017-10-22
3 95003 2017-05-21 2017-05-27
4 95004 2017-02-11 2017-02-15
5 95005 2017-06-13 2017-06-18
6 95006 2017-03-07 2017-03-09
7 95007 2016-07-05 2017-04-22
8 95008 2015-05-18 2016-05-23
How about as @Jon mentioned.
>>> df.groupby('ID', as_index=False).agg({'FromDate': 'min', 'ToDate': 'max'})
ID FromDate ToDate
0 95000 2017-06-12 2017-06-22
1 95001 2017-04-16 2017-04-22
2 95002 2017-10-13 2017-10-22
3 95003 2017-05-21 2017-05-27
4 95004 2017-02-11 2017-02-15
5 95005 2017-06-13 2017-06-18
6 95006 2017-03-07 2017-03-09
7 95007 2016-07-05 2017-04-22
8 95008 2015-05-18 2016-05-23
answered Nov 10 at 16:02
pygo
1,478415
1,478415
For 950007 I expect 2 records : first record 7/5 to 7/5 and second one 4/11 to 4/22
– user10633376
Nov 10 at 16:32
add a comment |
For 950007 I expect 2 records : first record 7/5 to 7/5 and second one 4/11 to 4/22
– user10633376
Nov 10 at 16:32
For 950007 I expect 2 records : first record 7/5 to 7/5 and second one 4/11 to 4/22
– user10633376
Nov 10 at 16:32
For 950007 I expect 2 records : first record 7/5 to 7/5 and second one 4/11 to 4/22
– user10633376
Nov 10 at 16:32
add a comment |
1
Are you after:
df.groupby('ID', as_index=False).agg({'FromDate': 'min', 'ToDate': 'max'})
?– Jon Clements♦
Nov 10 at 15:39
I want to compare Row1.ToDAte = Row2.FromDate or Row2.FromDate-1. Only when that condition satisfies I want to do the group by. The number of rows to compare could be more than 2,3,4,5,6 or n. So simple group by does not work. So looking for help
– user10633376
Nov 10 at 16:03
1
Might want to edit your question to include that and maybe elaborate on it a bit as to why the other suggestions offered don't work. You can see there's already confusion over what you're asking. Please be as detailed as possible as to the logic you're after and how it ends up with the results you've specified.
– Jon Clements♦
Nov 10 at 16:04
Question is not clear , you need to make it explicit worth readable to get the better inputs, also would be great what you tried to achieve this so reader can get a clue..
– pygo
Nov 10 at 16:16