diff --git a/src/imdb_datasets_worker.py b/src/imdb_datasets_worker.py index d743bbf..17ca16e 100644 --- a/src/imdb_datasets_worker.py +++ b/src/imdb_datasets_worker.py @@ -16,6 +16,8 @@ def convert_tsv_to_db(title_basics_tsv): with gzip.open(title_basics_tsv, mode='rt') as file: write_dataset = [] counter = 0 + chunk = 1000 + progress_counter = 0 for line in file: line = line.split("\t") try: @@ -24,23 +26,23 @@ def convert_tsv_to_db(title_basics_tsv): original_name = line[3] ru_name = None year = line[5] - - if tt_type not in ("movie", "video"): - original_name = None - year = "\\N" - else: - print(tt_id, tt_type, original_name, ru_name, year) - - if year == "\\N": + if year.startswith(r"\N"): year = None else: year = int(year) + + if tt_type not in ("movie", "video"): + original_name = None + year = None + write_dataset.append((tt_id, tt_type, original_name, ru_name, year)) counter += 1 - if counter >= 1000: + if counter >= chunk: c.executemany("INSERT OR REPLACE INTO titles(tt_id, type, original_name, ru_name, year) VALUES (?, ?, ?, ?, ?)", write_dataset) write_dataset = [] counter = 0 + progress_counter += chunk + print(f'Обработано: {progress_counter}') except Exception as E: print(E) pass @@ -50,6 +52,7 @@ def extract_ru_locale_from_tsv(title_akas_tsv): '''Конвертирование датасета с локализованными названиями и последующее добавление в базу''' with gzip.open(title_akas_tsv, mode='rt') as file: ru_name_writer = [] + counter = 0 for line in file: line = line.split("\t") try: @@ -62,8 +65,9 @@ def extract_ru_locale_from_tsv(title_akas_tsv): if tt_type not in ("movie", "video"): continue ru_name = line[2] - print(ru_name, tt_type) ru_name_writer.append((ru_name, tt_id)) + counter += 1 + print(f'Обработано ru_name: {counter}') except Exception as E: print(E)