To support this question here: https://stackoverflow.com/questions/48633906/comparing-two-string-columns-python
The sample code below imports a CSV file with three columns: a list of last names (`LastName1`), the year (`Year`), and a second list of last names (`LastName2`). It then finds the last names that appear in both lists and increments the match count for the year of each matching row. The resulting number of last-name matches per year is displayed as a bar graph.
import os
import tkinter as tk
from tkinter import filedialog
import csv
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt


def _read_name_columns(file_path):
    """Read the LastName1, Year and LastName2 columns of the CSV at file_path.

    Returns three parallel lists (last names 1, years, last names 2) with one
    entry per data row. Raises KeyError if one of the three column headers is
    missing from the file.
    """
    last_names_1 = []
    years = []
    last_names_2 = []
    # newline='' lets the csv module handle line endings itself, as its
    # documentation recommends for files passed to a reader.
    with open(file_path, 'r', newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            # print columns for validation
            print(row['LastName1'], row['Year'], row['LastName2'])
            last_names_1.append(row['LastName1'])
            years.append(row['Year'])
            last_names_2.append(row['LastName2'])
    return last_names_1, years, last_names_2


def _count_matches(last_names_1, last_names_2, year_counts):
    """Increment year_counts[i][1] once for every LastName2 entry equal to last_names_1[i].

    year_counts is a list of [year, count] pairs parallel to last_names_1 and
    is updated in place.
    """
    for candidate in last_names_2:
        # Blank or missing cells (e.g. columns of unequal length) are not
        # names and must not be counted as matching each other.
        if not candidate:
            continue
        # enumerate yields the index of the row actually being compared;
        # list.index() would always report the first occurrence and so
        # mis-credit names that appear more than once in column 1.
        for index, name in enumerate(last_names_1):
            if name != candidate:
                continue
            print(name + " is a match")
            print("Last Name List 1 Index of match is: " + str(index) + " : Last Name: " + name)
            print("\n")
            print("index match found: " + str(index))
            year_counts[index][1] += 1
            print("\n")
            # validate change in ordered pairs
            for pair in year_counts:
                print(pair)


def _plot_match_counts(year_counts):
    """Show a bar chart with one bar per [year, count] pair, labelled by year."""
    labels = [yr for yr, _ in year_counts]
    heights = [count for _, count in year_counts]
    # Derive the bar positions from the data itself so the chart can be drawn
    # even when no name matched.
    positions = np.arange(len(year_counts))
    plt.xlabel('Years')
    plt.ylabel('Number of Last Name Matches')
    plt.title(r'Last Name Matches per Year')
    plt.bar(positions, height=heights)
    plt.xticks(positions, labels)
    plt.show()


def findLastNameMatch():
    """Ask for a CSV file, count LastName1/LastName2 matches per row, and plot them.

    The CSV must have the column headers LastName1, Year and LastName2. Every
    data row gets a [year, count] pair; the count is incremented each time a
    value from the LastName2 column equals that row's LastName1 value. The
    pairs are printed for validation and then shown as a bar chart whose tick
    labels are the years.

    Returns None. If the file dialog is dismissed without a selection, a
    message is printed and nothing else happens.
    """
    file_path = filedialog.askopenfilename()
    # The dialog hands back an empty value when it is cancelled; passing that
    # on to open() would only raise.
    if not file_path:
        print("No file selected.")
        return

    print("\n")
    # print file path for validation
    print(file_path)
    print("\n")

    last_names_1, years, last_names_2 = _read_name_columns(file_path)

    # count elements in each list to validate that they match
    print("\n")
    print("The number of elements in lastName1 is " + str(len(last_names_1)) + "\n")
    print("The number of elements in year is " + str(len(years)) + "\n")
    print("The number of elements in lastName2 is " + str(len(last_names_2)) + "\n")

    # create ordered pairs of year and count (count starts at zero)
    year_counts = [[yr, 0] for yr in years]
    print("\n")
    for pair in year_counts:
        print(pair)
    print("\n")

    # print "year,lastName1" for every row for validation
    for yr, name in zip(years, last_names_1):
        print(str(yr) + "," + str(name))
    print("\n")

    _count_matches(last_names_1, last_names_2, year_counts)
    print("\n")

    _plot_match_counts(year_counts)


#Match the year
findLastNameMatch()