To support this question here: https://stackoverflow.com/questions/48633906/comparing-two-string-columns-python
The sample code below imports a CSV file with three columns: a list of last names (`LastName1`), the year (`Year`), and a second list of last names (`LastName2`). The code then looks for matches between the two last-name lists and increments the number of matches per year. The last-name matches per year are displayed as a bar graph.
import os
import tkinter as tk
from tkinter import filedialog
import csv
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
def _read_name_columns(file_path):
    """Return the LastName1, Year and LastName2 columns of a CSV as three lists."""
    last_name1 = []
    years = []
    last_name2 = []
    # newline='' is what the csv module asks for so that it can handle
    # line endings (including newlines embedded in quoted fields) itself.
    with open(file_path, 'r', newline='') as csvfile:
        # column names must match the keys used below
        for row in csv.DictReader(csvfile):
            # print columns for validation
            print(row['LastName1'], row['Year'], row['LastName2'])
            last_name1.append(row['LastName1'])
            years.append(row['Year'])
            last_name2.append(row['LastName2'])
    return last_name1, years, last_name2


def _count_matches_per_row(last_name1, years, last_name2):
    """Return one [year, count] pair per row, counting LastName2 hits on that row's LastName1."""
    year_counts = [[year, 0] for year in years]
    for candidate in last_name2:
        # enumerate() yields the index of the row actually being compared.
        # list.index() would always return the FIRST occurrence of a name, so
        # repeated names in LastName1 would all be credited to the first row
        # (and therefore to the wrong year).
        for row_index, name in enumerate(last_name1):
            if name != candidate:
                continue
            print(f"{name} is a match")
            print(f"Last Name List 1 Index of match is: {row_index} : Last Name: {name}")
            print("\n")
            print(f"index match found: {row_index}")
            year_counts[row_index][1] += 1
            print("\n")
    return year_counts


def _plot_matches(year_counts):
    """Draw the match counts as a bar chart with each row's year as tick label."""
    tick_labels = [year for year, _ in year_counts]
    heights = [count for _, count in year_counts]
    plt.xlabel('Years')
    plt.ylabel('Number of Last Name Matches')
    plt.title(r'Last Name Matches per Year')
    # The number of bars comes from the data itself, so the chart can still be
    # drawn when no match was found at all.
    positions = np.arange(len(year_counts))
    plt.bar(positions, height=heights)
    plt.xticks(positions, tick_labels)
    plt.show()


def findLastNameMatch(file_path=None, *, show_plot=True):
    """Count last-name matches per CSV row and show them as a bar graph.

    The CSV file must have the columns ``LastName1``, ``Year`` and
    ``LastName2``.  Every ``LastName2`` entry is compared with every
    ``LastName1`` entry; each equality increments the counter of the row that
    holds the matching ``LastName1`` entry.  Intermediate values are printed
    for validation, and the counters are drawn as a bar chart whose ticks are
    the rows' years.

    Args:
        file_path: Path of the CSV file.  When ``None`` (the default) a
            tkinter file dialog asks for it.
        show_plot: When true (the default) the bar chart is displayed; pass
            ``False`` to only compute and print the counts.

    Returns:
        A list with one ``[year, match_count]`` pair per CSV row, in file
        order (rows that share a year are not merged).  An empty list is
        returned when no file was selected.

    Raises:
        KeyError: If one of the three expected columns is missing.
        OSError: If the file cannot be opened.
    """
    if file_path is None:
        file_path = filedialog.askopenfilename()
    if not file_path:
        # The dialog was cancelled (or an empty path was passed): there is
        # nothing to read, so stop instead of failing inside open().
        print("No file selected.")
        return []

    # print file path for validation
    print("\n")
    print(file_path)
    print("\n")

    last_name1, years, last_name2 = _read_name_columns(file_path)

    # count elements in each list to validate that they match
    print("\n")
    print(f"The number of elements in lastName1 is {len(last_name1)}\n")
    print(f"The number of elements in year is {len(years)}\n")
    print(f"The number of elements in lastName2 is {len(last_name2)}\n")

    # show the initial [year, 0] pairs
    print("\n")
    for year in years:
        print([year, 0])
    print("\n")

    # show the "year,lastName1" pairs
    for year, name in zip(years, last_name1):
        print(f"{year},{name}")
    print("\n")

    year_counts = _count_matches_per_row(last_name1, years, last_name2)

    # validate change in ordered pairs
    for pair in year_counts:
        print(pair)
    print("\n")

    if show_plot:
        _plot_matches(year_counts)
    return year_counts
# Run the interactive workflow only when this file is executed as a script,
# so that importing it does not open a file dialog as a side effect.
if __name__ == "__main__":
    findLastNameMatch()