Skip to content

Commit 7e3a004

Browse files
committed
Validate data and config with test CSVs
1 parent 3985bd8 commit 7e3a004

4 files changed

Lines changed: 152 additions & 0 deletions

File tree

‎split_data.rb‎

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
#!/usr/bin/env ruby
2+
3+
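# Validate config.yml and the CSV to be split: required columns present,
# no blank institution values, and every institution in the CSV is configured.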
4+
require 'optparse'
5+
require 'csv'
6+
require 'yaml'
7+
8+
9+
# Default name of the CSV column holding the institution QID.
QID_DEFAULT = 'holding_institution'
# Default name of the CSV column holding the institution name 'as recorded'.
AS_RECORDED_DEFAULT = 'holding_institution_as_recorded'
# config.yml is looked up in the same directory as this script.
DEFAULT_CONFIG = File.expand_path '../config.yml', __FILE__

# Column names used by the validators, keyed by :qid and :as_recorded.
options = {
  qid: QID_DEFAULT,
  as_recorded: AS_RECORDED_DEFAULT
}
16+
17+
# Run every CSV check and raise if any of them reports a problem.
#
# @param csv [String] path of the CSV file to check
# @param options [Hash] column names under :qid and :as_recorded
# @return [nil] when no check reports a problem
# @raise [RuntimeError] with one line per failed check
def validate_csv csv, options
  # Each validator returns nil on success or a message string on failure.
  errors = [
    validate_headers(csv, options),
    validate_data(csv, options)
  ].compact
  return if errors.empty?

  raise %Q{Errors encountered:\n#{errors.join "\n"}}
end
25+
26+
# Count the data rows whose QID or 'as recorded' cell is blank.
#
# A cell counts as blank when it is nil, empty, or whitespace only.
#
# @param csv [String] path of the CSV file; its first row is read as headers
# @param options [Hash] column names under :qid and :as_recorded
# @return [String, nil] a summary of the blank counts, or nil when no row
#   has a blank value in either column
def validate_data csv, options
  qid_col = options[:qid]
  as_recorded_col = options[:as_recorded]
  missing_qid = 0
  missing_name = 0

  CSV.foreach csv, headers: true do |row|
    missing_qid += 1 if row[qid_col].to_s.strip.empty?
    missing_name += 1 if row[as_recorded_col].to_s.strip.empty?
  end

  return if missing_qid.zero? && missing_name.zero?

  "Rows missing #{qid_col}: #{missing_qid}; and #{as_recorded_col}: #{missing_name}"
end
41+
42+
# Check that the CSV's first row names both required columns.
#
# @param csv [String] path of the CSV file
# @param options [Hash] required column names under :qid and :as_recorded
# @return [String, nil] a message naming each missing column, or nil when
#   both are present
def validate_headers csv, options
  # Only the first row is needed, so stop reading after it. An empty file
  # yields nil, which is treated as "no headers" rather than crashing.
  headers = CSV.foreach(csv).first || []

  missing = %i[qid as_recorded]
    .reject { |key| headers.include? options[key] }
    .map { |key| "#{key}: '#{options[key]}'" }
  return if missing.empty?

  "Could not find required column(s) -- #{missing.join ', '}"
end
53+
54+
# Build a map of institution QID to its 'as recorded' name from the CSV.
#
# Rows with a blank QID or a blank name are skipped. The first usable name
# seen for a QID is kept; later rows for the same QID are ignored.
#
# @param csv [String] path of the CSV file; its first row is read as headers
# @param options [Hash] column names under :qid and :as_recorded
# @return [Hash{String => String}] QID => name as recorded
def read_institutions csv, options
  qid_col = options[:qid]
  name_col = options[:as_recorded]
  data = {}

  CSV.foreach csv, headers: true do |row|
    qid = row[qid_col]
    next if qid.to_s.strip.empty?
    # Look the row's QID up, not the column name, so a QID that is already
    # recorded is not overwritten by a later row.
    next if data.key? qid

    as_recorded = row[name_col]
    next if as_recorded.to_s.strip.empty?

    data[qid] = as_recorded
  end

  data
end
68+
69+
# Verify that every institution found in the CSV has an entry in the config.
#
# When some are missing, a YAML stub for each one is printed to STDERR so it
# can be completed and pasted into config.yml, and an error is raised.
#
# @param config [Array<Hash>] config entries; each is read via its :qid key
# @param csv [String] path of the CSV file
# @param options [Hash] column names under :qid and :as_recorded
# @return [nil] when every CSV institution is present in the config
# @raise [RuntimeError] when at least one CSV institution is not configured
def check_config config, csv, options
  # Index the configured QIDs once instead of rescanning config per CSV QID.
  known = config.each_with_object({}) { |inst, seen| seen[inst[:qid]] = true }
  missing = read_institutions(csv, options).reject { |qid, _name| known.key? qid }
  return if missing.empty?

  stubs = missing.map { |qid, name|
    {
      qid: qid,
      name: (name || 'REPLACE_WITH_INST_NAME'),
      directory: 'REPLACE_WITH_DIR_NAME'
    }
  }
  # Drop the first line of the dump (the YAML document marker) so the output
  # can be appended to an existing list.
  message = stubs.to_yaml.lines[1..-1].join

  STDERR.puts "CSV has institutions not in the config."
  STDERR.puts "Complete the following lines and add them to config.yml"
  STDERR.puts
  STDERR.puts message
  STDERR.puts
  raise "Error: config.yml is missing values"
end
92+
93+
# Ensure no QID and no directory appears more than once in the config.
#
# Duplicated values are listed on STDERR before the error is raised.
#
# @param config [Array<Hash>] config entries; each is read via its :qid and
#   :directory keys
# @return [nil] when all QIDs and directories are unique
# @raise [RuntimeError] when any QID or directory is duplicated
def validate_config config
  # QIDs and folders must be unique
  qids = Hash.new(0)
  dirs = Hash.new(0)
  config.each do |inst|
    qids[inst[:qid]] += 1
    dirs[inst[:directory]] += 1
  end

  dupes = []
  qids.each do |qid, count|
    dupes << "QID: #{qid}, count: #{count}" if count > 1
  end
  dirs.each do |dir, count|
    dupes << "Directory: #{dir}, count: #{count}" if count > 1
  end
  return if dupes.empty?

  STDERR.puts "Invalid config.yml; the following values are duplicated:"
  STDERR.puts
  STDERR.puts dupes.join "\n"
  STDERR.puts
  raise "Invalid config; see errors above"
end
115+
116+
# Parse command-line switches. Each switch overwrites the matching column
# name in `options`; parse! removes the switches it recognises from ARGV.
ARGV.options do |opts|
  opts.banner = "Usage: #{File.basename __FILE__} [OPTIONS] CSV_TO_SPLIT"

  q_msg = %Q{Institution QID column; default: #{QID_DEFAULT} }
  opts.on '-q', '--qid-column COLUMN', q_msg do |qid|
    options[:qid] = qid
  end

  a_msg = %Q{Institution 'as recorded' column; default: #{AS_RECORDED_DEFAULT} }
  opts.on '-a', '--as-recorded-column COLUMN', a_msg do |as_recorded|
    options[:as_recorded] = as_recorded
  end

  opts.parse!
end
131+
132+
# read the config and fail if it's not present
abort "Cannot find config file #{DEFAULT_CONFIG}" unless File.exist? DEFAULT_CONFIG
config = YAML.load_file DEFAULT_CONFIG

# The first argument left on ARGV is the CSV to check. Stop with a clear
# message when it is absent or does not exist, rather than failing inside
# the CSV reader.
csv = ARGV.shift
abort "Usage: #{File.basename __FILE__} [OPTIONS] CSV_TO_SPLIT" if csv.nil?
abort "Cannot find CSV file #{csv}" unless File.exist? csv

# Order matters: confirm the config is internally consistent and the CSV is
# well formed before comparing one against the other.
validate_config config
validate_csv csv, options
check_config config, csv, options

‎test/missing_inst_name.csv‎

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
ds_id,date_added,date_last_updated,source_type,holding_institution,holding_institution_as_recorded,holding_institution_id_number,link_to_holding_institution_record,iiif_manifest,production_place_as_recorded,production_place,production_date_as_recorded,production_date,century,century_aat,dated,uniform_title,uniform_title_as_recorded,uniform_title_agr,title_as_recorded_245,title_as_recorded_245_agr,genre_as_recorded,genre_as_recorded_lcsh,genre_as_recorded_aat,genre_as_recorded_rbprov,genre_as_recorded_lcgft,genre,named_subject_as_recorded,subject_as_recorded,subject,author_as_recorded,author_as_recorded_agr,author,artist_as_recorded,artist_as_recorded_agr,artist,scribe_as_recorded,scribe_as_recorded_agr,scribe,language_as_recorded,language,former_owner_as_recorded,former_owner_as_recorded_agr,former_owner,material,material_placeholder,physical_description,acknowledgements,data_processed_at,data_source_modified,source_file
2+
,,,digital-scriptorium,Q30257935,,CA 01,"",,France?,"","s. IX/X, 890-910",890^910,9;10,http://vocab.getty.edu/aat/300404501;http://vocab.getty.edu/aat/300404502,false,,"","","Bible, O.T.","","",,,,,,,"",,"","",,"","",,"","",,Latin,"",Brought from the library of Engelberg Abbey in Switzerland to Conception Abbey sometime before 1900.,"",,"",parchment,Extent: f. 1r-v; 212 x 142 mm.,We thank Michael W. Heil for his work in making this description available.,2022-04-12T18:29:27-04:00,2021-10-01,/Users/emeryr/code/GIT/ds-scripts/scripts/../data/digitalassets.lib.berkeley.edu/ds/conception/mets/ds_50_15_00132322.xml
3+
,,,digital-scriptorium,Q30257935,Conception Abbey and Seminary,CA 02,"",,Germany?,"","s. X, 900-999",900^999,10,http://vocab.getty.edu/aat/300404502,false,,"","",Sacramentary,"","",,,,,,,"",,"","",,"","",,"","",,Latin,"",Brought from the library of Engelberg Abbey in Switzerland to Conception Abbey sometime before 1900.,"",,"",parchment,Extent: f. 1r-v; 180 x 138 mm.,We thank Michael W. Heil for his work in making this description available.,2022-04-12T18:29:27-04:00,2021-10-01,/Users/emeryr/code/GIT/ds-scripts/scripts/../data/digitalassets.lib.berkeley.edu/ds/conception/mets/ds_50_15_00132323.xml
4+
,,,digital-scriptorium,Q30257935,Conception Abbey and Seminary,CA 19,"",,?,"",Undetermined,"",,,false,,"","","Decretals of Gregory IX, commentary on","","",,,,,,,"",,"","",,"","",,"","",,Latin,"",Brought from the library of Engelberg Abbey in Switzerland to Conception Abbey sometime before 1900.,"",,"",parchment,Extent: ff. 1-2v; 162 x 152 mm.,We thank Michael W. Heil for his work in making this description available.,2022-04-12T18:29:27-04:00,2021-10-01,/Users/emeryr/code/GIT/ds-scripts/scripts/../data/digitalassets.lib.berkeley.edu/ds/conception/mets/ds_50_15_00132324.xml
5+
,,,digital-scriptorium,Q30257935,Conception Abbey and Seminary,CA 35,"",,France?,"","s. XIV, 1300-1399",1300^1399,14,http://vocab.getty.edu/aat/300404506,false,,"","",Meteorologica,"","",,,,,,,"",,"","",,"","",,"","",,Latin,"","Bequeathed to Conception Abbey by Dr. Charles D. Humberd of Barnard, Missouri.","",,"",parchment,Extent: ff. 1-2v; 270 x 215 mm.,We thank Michael W. Heil for his work in making this description available.,2022-04-12T18:29:27-04:00,2021-10-01,/Users/emeryr/code/GIT/ds-scripts/scripts/../data/digitalassets.lib.berkeley.edu/ds/conception/mets/ds_50_15_00132325.xml

‎test/missing_qid.csv‎

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
ds_id,date_added,date_last_updated,source_type,holding_institution,holding_institution_as_recorded,holding_institution_id_number,link_to_holding_institution_record,iiif_manifest,production_place_as_recorded,production_place,production_date_as_recorded,production_date,century,century_aat,dated,uniform_title,uniform_title_as_recorded,uniform_title_agr,title_as_recorded_245,title_as_recorded_245_agr,genre_as_recorded,genre_as_recorded_lcsh,genre_as_recorded_aat,genre_as_recorded_rbprov,genre_as_recorded_lcgft,genre,named_subject_as_recorded,subject_as_recorded,subject,author_as_recorded,author_as_recorded_agr,author,artist_as_recorded,artist_as_recorded_agr,artist,scribe_as_recorded,scribe_as_recorded_agr,scribe,language_as_recorded,language,former_owner_as_recorded,former_owner_as_recorded_agr,former_owner,material,material_placeholder,physical_description,acknowledgements,data_processed_at,data_source_modified,source_file
2+
,,,digital-scriptorium,,Conception Abbey and Seminary,CA 01,"",,France?,"","s. IX/X, 890-910",890^910,9;10,http://vocab.getty.edu/aat/300404501;http://vocab.getty.edu/aat/300404502,false,,"","","Bible, O.T.","","",,,,,,,"",,"","",,"","",,"","",,Latin,"",Brought from the library of Engelberg Abbey in Switzerland to Conception Abbey sometime before 1900.,"",,"",parchment,Extent: f. 1r-v; 212 x 142 mm.,We thank Michael W. Heil for his work in making this description available.,2022-04-12T18:29:27-04:00,2021-10-01,/Users/emeryr/code/GIT/ds-scripts/scripts/../data/digitalassets.lib.berkeley.edu/ds/conception/mets/ds_50_15_00132322.xml
3+
,,,digital-scriptorium,Q30257935,Conception Abbey and Seminary,CA 02,"",,Germany?,"","s. X, 900-999",900^999,10,http://vocab.getty.edu/aat/300404502,false,,"","",Sacramentary,"","",,,,,,,"",,"","",,"","",,"","",,Latin,"",Brought from the library of Engelberg Abbey in Switzerland to Conception Abbey sometime before 1900.,"",,"",parchment,Extent: f. 1r-v; 180 x 138 mm.,We thank Michael W. Heil for his work in making this description available.,2022-04-12T18:29:27-04:00,2021-10-01,/Users/emeryr/code/GIT/ds-scripts/scripts/../data/digitalassets.lib.berkeley.edu/ds/conception/mets/ds_50_15_00132323.xml
4+
,,,digital-scriptorium,Q30257935,Conception Abbey and Seminary,CA 19,"",,?,"",Undetermined,"",,,false,,"","","Decretals of Gregory IX, commentary on","","",,,,,,,"",,"","",,"","",,"","",,Latin,"",Brought from the library of Engelberg Abbey in Switzerland to Conception Abbey sometime before 1900.,"",,"",parchment,Extent: ff. 1-2v; 162 x 152 mm.,We thank Michael W. Heil for his work in making this description available.,2022-04-12T18:29:27-04:00,2021-10-01,/Users/emeryr/code/GIT/ds-scripts/scripts/../data/digitalassets.lib.berkeley.edu/ds/conception/mets/ds_50_15_00132324.xml

‎test/unknown_qid.csv‎

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
ds_id,date_added,date_last_updated,source_type,holding_institution,holding_institution_as_recorded,holding_institution_id_number,link_to_holding_institution_record,iiif_manifest,production_place_as_recorded,production_place,production_date_as_recorded,production_date,century,century_aat,dated,uniform_title,uniform_title_as_recorded,uniform_title_agr,title_as_recorded_245,title_as_recorded_245_agr,genre_as_recorded,genre_as_recorded_lcsh,genre_as_recorded_aat,genre_as_recorded_rbprov,genre_as_recorded_lcgft,genre,named_subject_as_recorded,subject_as_recorded,subject,author_as_recorded,author_as_recorded_agr,author,artist_as_recorded,artist_as_recorded_agr,artist,scribe_as_recorded,scribe_as_recorded_agr,scribe,language_as_recorded,language,former_owner_as_recorded,former_owner_as_recorded_agr,former_owner,material,material_placeholder,physical_description,acknowledgements,data_processed_at,data_source_modified,source_file
2+
,,,digital-scriptorium,Q30257935x,Conception Abbey and Seminary,CA 01,"",,France?,"","s. IX/X, 890-910",890^910,9;10,http://vocab.getty.edu/aat/300404501;http://vocab.getty.edu/aat/300404502,false,,"","","Bible, O.T.","","",,,,,,,"",,"","",,"","",,"","",,Latin,"",Brought from the library of Engelberg Abbey in Switzerland to Conception Abbey sometime before 1900.,"",,"",parchment,Extent: f. 1r-v; 212 x 142 mm.,We thank Michael W. Heil for his work in making this description available.,2022-04-12T18:29:27-04:00,2021-10-01,/Users/emeryr/code/GIT/ds-scripts/scripts/../data/digitalassets.lib.berkeley.edu/ds/conception/mets/ds_50_15_00132322.xml
3+
,,,digital-scriptorium,Q30257935,Conception Abbey and Seminary,CA 02,"",,Germany?,"","s. X, 900-999",900^999,10,http://vocab.getty.edu/aat/300404502,false,,"","",Sacramentary,"","",,,,,,,"",,"","",,"","",,"","",,Latin,"",Brought from the library of Engelberg Abbey in Switzerland to Conception Abbey sometime before 1900.,"",,"",parchment,Extent: f. 1r-v; 180 x 138 mm.,We thank Michael W. Heil for his work in making this description available.,2022-04-12T18:29:27-04:00,2021-10-01,/Users/emeryr/code/GIT/ds-scripts/scripts/../data/digitalassets.lib.berkeley.edu/ds/conception/mets/ds_50_15_00132323.xml
4+
,,,digital-scriptorium,Q30257935,Conception Abbey and Seminary,CA 19,"",,?,"",Undetermined,"",,,false,,"","","Decretals of Gregory IX, commentary on","","",,,,,,,"",,"","",,"","",,"","",,Latin,"",Brought from the library of Engelberg Abbey in Switzerland to Conception Abbey sometime before 1900.,"",,"",parchment,Extent: ff. 1-2v; 162 x 152 mm.,We thank Michael W. Heil for his work in making this description available.,2022-04-12T18:29:27-04:00,2021-10-01,/Users/emeryr/code/GIT/ds-scripts/scripts/../data/digitalassets.lib.berkeley.edu/ds/conception/mets/ds_50_15_00132324.xml

0 commit comments

Comments
 (0)