df = df.drop(columns=drop_columns).sort_values('sha256 hash')  # drop columns we don't need & sort by hash
duplicate_hash = df.loc[df.duplicated(subset=['sha256 hash'], keep=False), :]  # all files with a duplicated hash - incl. repeat submissions from the same student ID
hash_with_multiple_student_ids = duplicate_hash.groupby('sha256 hash').agg(lambda x: len(x.unique()) > 1)  # True if a hash has more than 1 unique student ID (= multiple students with the same hash), False otherwise (= same student re-submitting the same file)
suspicious_hashes_list = hash_with_multiple_student_ids[hash_with_multiple_student_ids['Student ID']].index.to_list()  # hashes shared by different student IDs (excludes repeat attempts from the same student)
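# A possible follow-up step (a sketch, not part of the original pipeline): pull the full
# rows for the suspicious hashes so the matching submissions can be reviewed side by side.
# 'sha256 hash' and 'Student ID' are the column names used above; the variable name
# suspicious_submissions is introduced here purely for illustration.
suspicious_submissions = (
    df[df['sha256 hash'].isin(suspicious_hashes_list)]
    .sort_values(['sha256 hash', 'Student ID'])  # group identical files together, then order by student
)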