Write a command-line program to "join" .csv files. Use any programming language you're comfortable with. Your program should work similarly to the Unix "join" utility (google for it). Unlike the Unix join, your program will not require files to be sorted on the key. Your program must also accept the "type" of join to use: merge join, nested loop join, hash join, etc. Test your program on "large" files (e.g. make sure it doesn't blow up on one million records).
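
Before the solution snippets below, here is a minimal sketch of how such a program could be wired together in Python. The flag names (--type, --key1, --key2) and the script layout are my own assumptions (the assignment only says the join type must be accepted), and only the hash join is filled in to keep the sketch short:

import argparse
import csv
import sys
from collections import defaultdict

def hash_join(left, key1, right, key2):
  # build a hash table on the left input, then probe it with each right row
  h = defaultdict(list)
  for row in left:
    h[row[key1]].append(row)
  return [(l, r) for r in right for l in h[r[key2]]]

def read_csv(path):
  with open(path, newline='') as f:
    return list(csv.reader(f))

def main():
  p = argparse.ArgumentParser(description="Join two .csv files on a key column.")
  p.add_argument("left")
  p.add_argument("right")
  p.add_argument("--type", choices=["hash", "nested", "merge"], default="hash")
  p.add_argument("--key1", type=int, default=0, help="key column index in the left file")
  p.add_argument("--key2", type=int, default=0, help="key column index in the right file")
  args = p.parse_args()

  left, right = read_csv(args.left), read_csv(args.right)
  if args.type != "hash":
    sys.exit("only the hash join is wired up in this sketch")
  out = csv.writer(sys.stdout)
  for l, r in hash_join(left, args.key1, right, args.key2):
    out.writerow(l + r)

if __name__ == "__main__":
  main()

Invoked, for example, as python join.py first_file.csv second_file.csv --type hash > joined.csv (the script name here is hypothetical).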

#Submit source code for the program.

#Also... load all files in ctsdata.20140211.tar (link on the left) into Oracle or Postgres (or whichever works for you). The format of these files is: cts(date,symbol,open,high,low,close,volume), splits(date,symbol,post,pre), dividend(date,symbol,dividend). Submit (email) whatever commands/files you used to load the data into whatever database you're using, as well as the raw space usage of the tables in your database.
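
For the loading part, one possible route is a short psycopg2 script that creates the three tables, bulk-loads them with COPY, and then asks Postgres for the raw table sizes. This is only a sketch: the connection string, the extracted file names (cts.csv, splits.csv, dividend.csv), and the column types are assumptions, not something given in the assignment.

import psycopg2

# connection string is an assumption; point it at your own database
conn = psycopg2.connect("dbname=ctsdata")
cur = conn.cursor()

# column types are guesses from cts(date,symbol,open,high,low,close,volume) etc.
cur.execute("""CREATE TABLE IF NOT EXISTS cts (
  date date, symbol text, open numeric, high numeric,
  low numeric, close numeric, volume bigint)""")
cur.execute("""CREATE TABLE IF NOT EXISTS splits (
  date date, symbol text, post numeric, pre numeric)""")
cur.execute("""CREATE TABLE IF NOT EXISTS dividend (
  date date, symbol text, dividend numeric)""")

# assumes the tar extracts to one CSV file per table
for table in ("cts", "splits", "dividend"):
  with open(table + ".csv") as f:
    cur.copy_expert("COPY " + table + " FROM STDIN WITH (FORMAT csv)", f)
conn.commit()

# raw space usage of each table (data, indexes, and TOAST)
for table in ("cts", "splits", "dividend"):
  cur.execute("SELECT pg_total_relation_size(%s)", (table,))
  print(table, cur.fetchone()[0], "bytes")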

from collections import defaultdict

def hashJoin(table1, index1, table2, index2):
  h = defaultdict(list)
  # hash phase: bucket every row of table1 by its key column
  for s in table1:
    h[s[index1]].append(s)
  # join phase: probe the hash table with each row of table2
  return [(s, r) for r in table2 for s in h[r[index2]]]
 
table1 = [(13, "OGBONNAYA"),
          (23, "OGB"),
          (24, "OG"),
          (6, "OGBO"),
          (8, "O")]

table2 = [("OGBONNAYA", "Jenny"),
          ("OGB", "Michelle"),
          ("OG", "Halsey"),
          ("OGBO", "Carly"),
          ("O", "Caroline")]

for row in hashJoin(table1, 1, table2, 0):
  print(row)
   
import csv

# First dictionary: key column -> value column
first_dict = {}
with open('first_file.csv', 'r', newline='') as f:
  csvReader = csv.reader(f)
  next(csvReader, None)  # skip the header
  for row in csvReader:
    key = row[0]
    first_dict[key] = row[1]

# Second dictionary: first three columns joined into one key
second_dict = {}
with open('second_file.csv', 'r', newline='') as f:
  csvReader = csv.reader(f)
  next(csvReader, None)  # skip the header
  for row in csvReader:
    key = " ".join(row[:3])
    second_dict[key] = row[4]
     
 
import csv
from collections import OrderedDict

# read both files into dictionaries keyed on the first column
with open('second_file.csv', 'r', newline='') as f:
  r = csv.reader(f)
  dict2 = {row[0]: row[1:] for row in r}

with open('first_file.csv', 'r', newline='') as f:
  r = csv.reader(f)
  dict1 = OrderedDict((row[0], row[1:]) for row in r)

# merge the two dictionaries, concatenating the values of rows that share a key
result = OrderedDict()
for d in (dict1, dict2):
  for key, value in d.items():
    result.setdefault(key, []).extend(value)

with open('ab_combines.csv', 'w', newline='') as f:
  w = csv.writer(f)
  for key, value in result.items():
    w.writerow([key] + value)
 
# nested loop join: for every row of the first file, scan every row of the second file

import csv

def link():
  first = open('first_file.csv')
  csv_file = csv.reader(first, delimiter="|")
  second = open('second_file.csv')
  csv_file2 = csv.reader(second, delimiter="|")

  # materialize the second file so it can be rescanned for every outer row
  second_rows = []
  for row in csv_file2:
    second_rows.append(row)

  for row in csv_file:
    match = False
    for secrow in second_rows:
      if row[0].strip() == secrow[0].strip():
        print(row[0] + "," + row[1] + "," + secrow[1])
        match = True
    if not match:
      print(row[0] + "," + row[1] + ", blank no match")

  first.close()
  second.close()

link()
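
The link() function above is a nested loop join. For the merge join the assignment also asks for, here is a sketch that is not from the original post: it assumes the table1/table2 test data from the hash join example, sorts both inputs on their key column, and walks them with two cursors.

def merge_join(table1, index1, table2, index2):
  # sort both inputs on the key column, then advance two cursors in step
  left = sorted(table1, key=lambda r: r[index1])
  right = sorted(table2, key=lambda r: r[index2])
  out, i, j = [], 0, 0
  while i < len(left) and j < len(right):
    lk, rk = left[i][index1], right[j][index2]
    if lk < rk:
      i += 1
    elif lk > rk:
      j += 1
    else:
      # collect the run of equal keys on each side and emit their cross product
      i2, j2 = i, j
      while i2 < len(left) and left[i2][index1] == lk:
        i2 += 1
      while j2 < len(right) and right[j2][index2] == rk:
        j2 += 1
      for l in left[i:i2]:
        for r in right[j:j2]:
          out.append((l, r))
      i, j = i2, j2
  return out

# same driver as the hash join example above
for row in merge_join(table1, 1, table2, 0):
  print(row)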