import pandas as pd


def maybe_sort_df(df, path="generate/results/WD_YDLJ.csv"):
    """Write *df* to a CSV file with its rows ordered by the ``did`` column.

    The sort is stable, so rows sharing the same ``did`` keep their original
    relative order. The row index is renumbered from 0 and is not written to
    the file. The caller's frame is not modified.

    Args:
        df: DataFrame that contains a ``did`` column.
        path: Destination CSV path. Defaults to the results file this
            function has always written to, so existing calls are unaffected.

    Returns:
        None. The result is only written to disk.
    """
    ordered = df.sort_values(by="did", kind="stable", ignore_index=True)
    ordered.to_csv(path, index=False)


# Load the index of the raw judgment documents and run four sanity checks on
# it. Each check prints a green confirmation or a red report with the
# offending rows/ids.
dfi = pd.read_csv("generate/original_dataset/原始裁判文书/index.csv")

# Check 1: no cell anywhere in the index may be NaN.
rows_with_nan = dfi.isna().any(axis=1)
if not rows_with_nan.any():
    print("\033[92mConfirm no missing data\033[0m")
else:
    print("\033[31mMissing data detected:\033[0m")
    print(dfi[rows_with_nan])

# Check 2: every document id occurs exactly once.
repeated_id_rows = dfi["id"].duplicated()
if repeated_id_rows.any():
    print("\033[31mDuplicate ids detected:\033[0m")
    print(dfi[repeated_id_rows])
else:
    print("\033[92mConfirm ids are unique\033[0m")

# Check 3: the ids cover 0 .. len(dfi) - 1 without gaps.
doc_total = len(dfi)
missing_ids = set(range(doc_total)) - set(dfi["id"])
if not missing_ids:
    print(f"\033[92mConfirm no missing ids (0 ~ {doc_total - 1})\033[0m")
else:
    print("\033[31mMissing ids detected:\033[0m")
    print(" ".join(str(doc_id) for doc_id in sorted(missing_ids)))

# Check 4: every document title occurs exactly once.
repeated_title_rows = dfi["title"].duplicated()
if repeated_title_rows.any():
    print("\033[31mDuplicate titles detected:\033[0m")
    print(dfi[repeated_title_rows])
else:
    print("\033[92mConfirm titles are unique\033[0m")

Confirm no missing data
Confirm ids are unique
Confirm no missing ids (0 ~ 499)
Confirm titles are unique


# Document counts for every (type, level, year) combination present in the index.
dfi_grp = dfi.groupby(["type", "level", "year"]).size().to_frame(name="count")

# Document counts per case type, plus each type's share of the whole index
# rendered as a percentage string with two decimals.
docs_per_type = dfi.groupby("type").size()
dfi_grp_macro = docs_per_type.to_frame(name="count")
dfi_grp_macro["%"] = dfi_grp_macro["count"] / docs_per_type.sum()
dfi_grp_macro["%"] = dfi_grp_macro["%"].map("{:.2%}".format)
dfi_grp_macro


# Reference document counts for every (type, level, year) combination.
# NOTE(review): the origin of these figures is not shown in this file —
# presumably the counts of the full published corpus; confirm.
ind = pd.MultiIndex.from_product(
    [
        ("刑事", "民事"),
        ("基层法院", "中级法院", "高级法院", "最高法院"),
        (2022, 2023),
    ],
    names=["type", "level", "year"],
)
df_ref = pd.DataFrame(index=ind, columns=["count"])

# (type, level) -> (count for 2022, count for 2023)
_ref_counts_by_year = {
    ("刑事", "基层法院"): (89185, 14191),
    ("刑事", "中级法院"): (82758, 11984),
    ("刑事", "高级法院"): (975, 61),
    ("刑事", "最高法院"): (82, 3),
    ("民事", "基层法院"): (4513220, 787645),
    ("民事", "中级法院"): (483979, 94326),
    ("民事", "高级法院"): (26942, 4452),
    ("民事", "最高法院"): (278, 2636),
}
for (_case_type, _court_level), _yearly in _ref_counts_by_year.items():
    for _year, _count in zip((2022, 2023), _yearly):
        df_ref.loc[(_case_type, _court_level, _year)] = _count


# Collapse the reference counts over the years: one total per (type, level).
df_ref = (
    df_ref.reset_index()
    .groupby(["type", "level"], as_index=False)["count"]
    .sum()
)

# For each case type, build a one-column frame ("%") indexed by court level
# that holds the level's share of that type's reference total.
df_ref_subtypes = {}
for tp in ("民事", "刑事"):
    level_totals = df_ref.loc[df_ref["type"] == tp].set_index("level")["count"]
    df_ref_subtype = (level_totals / level_totals.sum()).to_frame(name="%")
    df_ref_subtypes[tp] = df_ref_subtype


# For each case type, compare the court-level distribution of the indexed
# documents with the reference distribution in df_ref_subtypes.
# Resulting columns: count, % (share in the index), % (ref) (reference share),
# lack (True when the index share is below the reference share).
dfi_comps = {}
# The flattened per-(type, level, year) counts do not depend on the case type,
# so build them once instead of on every iteration.
dfi_counts = dfi_grp.reset_index()
for tp in ("民事", "刑事"):
    dfi_comp = dfi_counts[dfi_counts["type"] == tp]
    dfi_comp = dfi_comp.groupby("level", as_index=False)["count"].sum()
    dfi_comp = dfi_comp.set_index("level")
    dfi_comp["%"] = dfi_comp["count"] / dfi_comp["count"].sum()
    # Outer join keeps levels that exist on only one side; the reference
    # column is renamed to "% (ref)" because both frames have a "%" column.
    dfi_comp = dfi_comp.join(df_ref_subtypes[tp], how="outer", rsuffix=" (ref)")
    # Levels missing on one side count as 0 documents / 0 share.
    dfi_comp = dfi_comp.fillna(0).astype({"count": int})
    # Decide "lack" on the numeric shares BEFORE they are turned into text:
    # comparing the formatted strings is lexicographic, so e.g.
    # "10.00%" < "9.00%" would be True. Rounding to 4 decimals matches the
    # two-decimal percentages displayed below, so equal-looking shares are
    # never flagged.
    is_lacking = (
        dfi_comp["%"].astype(float).round(4)
        < dfi_comp["% (ref)"].astype(float).round(4)
    )
    dfi_comp["%"] = dfi_comp["%"].apply("{:.2%}".format)
    dfi_comp["% (ref)"] = dfi_comp["% (ref)"].apply("{:.2%}".format)
    dfi_comp["type"] = tp
    dfi_comp["lack"] = is_lacking
    dfi_comp = dfi_comp.reset_index().set_index(["type", "level"])
    dfi_comps[tp] = dfi_comp


# Show the index-vs-reference court-level comparison for criminal (刑事) cases.
dfi_comps["刑事"]


# Show the index-vs-reference court-level comparison for civil (民事) cases.
dfi_comps["民事"]


# Load the constructed question/answer data and verify that every document id
# in the index (0 .. len(dfi) - 1) appears at least once in its "did" column.
df = pd.read_csv("generate/results/WD_YDLJ.csv")
print(f"Constructed data: {len(df)}")

expected_dids = set(range(len(dfi)))
missing_cons_dids = expected_dids - set(df["did"])
if not missing_cons_dids:
    print("\033[92mConfirm all existing datasets constructed\033[0m")
else:
    # Print the commands that would build the missing documents.
    missing_cons_dids_str = " ".join(str(did) for did in sorted(missing_cons_dids))
    print(f"\033[31mDetected {len(missing_cons_dids)} datasets not constructed:\033[0m")
    print("> python generate/clean_wenshu_all.py")
    print(f"> python generate/WD_YDLJ.py -p -d {missing_cons_dids_str}")
df

Constructed data: 9272
Confirm all existing datasets constructed


# Flag rows whose question or answer contains the literal marker "无更新".
# na=False: a missing question/answer counts as "marker not present";
# without it str.contains yields NaN for such cells and the boolean
# indexing below raises. regex=False: the marker is plain text, not a pattern.
question_flagged = df["question"].str.contains("无更新", regex=False, na=False)
answer_flagged = df["answer"].str.contains("无更新", regex=False, na=False)
bad_update_df = df[question_flagged | answer_flagged]
if len(bad_update_df) != 0:
    print(f"Incorrect update: {len(bad_update_df)} ({len(bad_update_df) / len(df):.3%}) out of {len(df)}")
else:
    print(f"\033[92mNo obviously incorrect data out of {len(df)}\033[0m")
# Display the flagged rows only when there are any.
bad_update_df if len(bad_update_df) != 0 else None

No obviously incorrect data out of 9272

	did	question	answer
0	0	被告和原告分别是谁？	一审被告为东莞市九鲨电子科技有限公司、深圳市尚品博林科技有限公司和谈杰，一审原告为蔡双双。
1	0	蔡双双的诉讼代理人是谁？	蔡双双的诉讼代理人是吴军，来自广东深科律师事务所。
2	0	本案涉及的法律问题是什么？	本案涉及侵害实用新型专利权的纠纷。
3	0	本案的一审判决结果是什么？	广东省深圳市中级人民法院作出的一审判决结果是（2021）粤03民初513号民事判决。
4	0	本案二审的案件受理费应由哪方负担？	本案一审案件受理费由蔡双双负担；二审案件受理费由东莞市九鲨电子科技有限公司负担。
...	...	...	...
9267	499	本案审理采用什么程序？	本案适用刑事案件速裁程序，实行独任审判。
9268	499	被告人挪用资金的数额是多少？	被告人邵勇挪用客户向被害单位支付的定金、货款共计人民币165450元，其中117720元用于...
9269	499	被告人是否认罪认罚？	被告人邵勇在值班律师在场的情况下签署认罪认罚具结书并自愿认罪、同意量刑建议和程序适用。
9270	499	被告人是否有自首情节？	是的，被告人邵勇犯罪以后自动投案，如实供述自己的罪行，被判从轻处罚。
9271	499	本判决可以上诉吗？	是的，如果不服本判决，可以在接到本判决书的第二日起十日内，通过本院或者直接向江苏省苏州市中级...

司法阅读理解（500文书版）

检查原始文书数据集

原始数据集中案由分布

原始数据集中各个案由下法院层级分布

指令集构造

		count	%	% (ref)	lack
type	level
刑事	中级法院	96	39.67%	47.55%	True
	基层法院	126	52.07%	51.89%	False
	最高法院	0	0.00%	0.04%	True
	高级法院	20	8.26%	0.52%	False

		count	%	% (ref)	lack
type	level
民事	中级法院	25	9.69%	9.78%	True
	基层法院	213	82.56%	89.64%	True
	最高法院	20	7.75%	0.05%	False
	高级法院	0	0.00%	0.53%	True

	count	%
type
刑事	242	48.40%
民事	258	51.60%