From adc88343ae1472c16e2953a2a24e2d6005e843e5 Mon Sep 17 00:00:00 2001 From: Eduardo Bonet <ebonet@gitlab.com> Date: Tue, 9 Jul 2024 08:22:22 +0000 Subject: [PATCH] Self-hosted models: create documentation index locally --- scripts/custom_models/create_index.py | 67 ++++++++++++++++++--------- 1 file changed, 46 insertions(+), 21 deletions(-) diff --git a/scripts/custom_models/create_index.py b/scripts/custom_models/create_index.py index 3bd1f22303c6..4384bc6254ea 100644 --- a/scripts/custom_models/create_index.py +++ b/scripts/custom_models/create_index.py @@ -25,11 +25,27 @@ logger = logging.getLogger(__name__) # Function to parse command-line arguments def parse_arguments(): - parser = argparse.ArgumentParser(description="Generate and upload GitLab docs index.") - parser.add_argument("--project_id", help="GitLab project ID", default=278964) - parser.add_argument("--version_tag", help="GitLab version tag to include in the URL (e.g., v17.1.0-ee)", - default='master') - parser.add_argument("--base_url", help="URL to gitlab instance", default="https://gitlab.com") + parser = argparse.ArgumentParser(description="Generate GitLab docs index.") + + parser.add_argument("-o", "--output_path", + help="Output path", + default="docs.db") + parser.add_argument("-d", "--download", + help="Downloads GitLab docs from a reference. If disabled, assumes docs are in /docs", + action='store_true') + parser.add_argument("--version_tag", + help="GitLab version tag to include in the URL (e.g., v17.1.0-ee). Only used when -d is set", + default="master") + parser.add_argument("-u", "--upload", + help='''Uploads documentation as a generic package to a registry defined by project_id and base_url. + Requires GLAB_TOKEN to be defined with a GitLab PAT with api scope''', + action='store_true') + parser.add_argument("--base_url", + help="URL to gitlab instance uploading. Only used when -u is set", + default="https://gitlab.com") + parser.add_argument("--project_id", + help="GitLab project ID. Only used when -u is set.", + default=278964) return parser.parse_args() @@ -126,10 +142,16 @@ def create_database(path, output_path): r['processed'] = build_row_corpus(r) # sql_tuples = [(r['processed'], r['content'], r['metadata']['filename']) for r in rows_to_insert if r['processed']] sql_tuples = [(r['processed'], r['content'], json.dumps(r['metadata'])) for r in rows_to_insert if r['processed']] + + if os.path.exists(output_path): + os.remove(output_path) + logger.info(f"Deleted existing file at {output_path}") + # Create the database conn = sqlite3.connect(output_path) c = conn.cursor() c.execute("CREATE VIRTUAL TABLE doc_index USING fts5(processed, content, metadata, tokenize='porter trigram');") + c.execute("PRAGMA user_version = 1;") c.executemany('INSERT INTO doc_index (processed, content, metadata) VALUES (?,?,?)', sql_tuples) conn.commit() conn.close() @@ -152,27 +174,30 @@ def upload_to_gitlab(upload_url, file_path, private_token): if __name__ == "__main__": args = parse_arguments() - private_token = os.environ['GLAB_TOKEN'] + if args.upload: + private_token = os.environ['GLAB_TOKEN'] - if not private_token: - execution_error("Private token must be set.") + if not private_token: + execution_error("Private token must be set.") - # Fetch documents based on version tag (if provided) - docs_path = fetch_documents(version_tag=args.version_tag) - if not docs_path: - execution_error("Fetching documents failed") + if args.download: + docs_path = fetch_documents(version_tag=args.version_tag) + if not docs_path: + execution_error("Fetching documents failed") + else: + docs_path = '' - # Create database - timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S") - output_path = f"{docs_path}/created_index_docs_{timestamp}.db" + output_path = args.output_path create_database(docs_path, output_path) logger.info(f"Database created at {output_path}") - # Upload to GitLab - if not os.path.exists(output_path): - execution_error("Database file not found.") - url = upload_url(args.base_url, args.project_id, args.version_tag) + if args.upload: + # Upload to GitLab + if not os.path.exists(output_path): + execution_error("Database file not found.") + + url = upload_url(args.base_url, args.project_id, args.version_tag) - logger.info(f"Uploading to {url}") + logger.info(f"Uploading to {url}") - upload_to_gitlab(url, output_path, private_token) + upload_to_gitlab(url, output_path, private_token) -- GitLab