{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from flashtext import KeywordProcessor\n", "import pandas as pd\n", "from pathlib import Path\n", "from collections import defaultdict\n", "from IPython.display import display, HTML" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[WindowsPath('../data/v1/WikiCSSH_categories.csv'),\n", " WindowsPath('../data/v1/WikiCSSH_category2page.csv'),\n", " WindowsPath('../data/v1/WikiCSSH_category_links.csv'),\n", " WindowsPath('../data/v1/WikiCSSH_category_links_all.csv'),\n", " WindowsPath('../data/v1/Wikicssh_core_categories.csv'),\n", " WindowsPath('../data/v1/WikiCSSH_page2redirect.csv')]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Directory containing the WikiCSSH CSV files (relative to this notebook).\n", "wikicssh_path = Path(\"../data/v1\")\n", "# Sort the listing so its order does not depend on the filesystem.\n", "wikicssh_files = sorted(wikicssh_path.glob(\"*.csv\"))\n", "wikicssh_files" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wall time: 20.4 s\n" ] } ], "source": [ "%%time\n", "# Map each page_title to the list of cat_title values it is paired with.\n", "# The path is built from wikicssh_path so the data directory is defined once.\n", "page2cats = (\n", "    pd.read_csv(wikicssh_path / \"WikiCSSH_category2page.csv\")\n", "    .groupby(\"page_title\")[\"cat_title\"]\n", "    .agg(list)\n", "    .to_dict()\n", ")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | category | \n", "level | \n", "
---|---|---|
0 | \n", "Computer_science | \n", "1 | \n", "
1 | \n", "Mathematics | \n", "1 | \n", "
2 | \n", "Information_science | \n", "1 | \n", "
3 | \n", "Computer_engineering | \n", "1 | \n", "
4 | \n", "Statistics | \n", "1 | \n", "