import json, re, time, urllib.parse, urllib.request, urllib.error
from collections import Counter, defaultdict

# Descriptive User-Agent: Reddit's public .json endpoints throttle/deny generic agents.
UA={'User-Agent':'HermesResearch/1.0 Miguel EDC festival prep'}
# Base URL for Reddit's public JSON listings.
BASE='https://www.reddit.com'
# Target subreddit (r/electricdaisycarnival).
SUB='electricdaisycarnival'

def get_json(url, retries=4):
    """Fetch *url* and decode the response body as JSON.

    Retries up to *retries* times: rate limits and server-side errors
    (429/5xx) back off ``2 + 3*i`` seconds; any other exception backs off
    ``1 + i`` seconds.  A non-retryable HTTPError is raised immediately.

    Bug fixed: the original fell off the end of the loop and silently
    returned ``None`` when every attempt hit a retryable HTTP error, which
    made callers crash later on ``data['data']``.  Now the last error is
    re-raised once retries are exhausted.
    """
    last_err = None
    for i in range(retries):
        try:
            req = urllib.request.Request(url, headers=UA)
            with urllib.request.urlopen(req, timeout=25) as r:
                return json.loads(r.read().decode('utf-8'))
        except urllib.error.HTTPError as e:
            # Only rate limiting and transient server errors are retried.
            if e.code in (429, 500, 502, 503, 504):
                last_err = e
                time.sleep(2 + i * 3)
                continue
            raise
        except Exception as e:
            if i == retries - 1:
                raise
            last_err = e
            time.sleep(1 + i)
    if last_err is not None:
        raise last_err
    return None  # only reachable with retries <= 0, matching the original

def listing(path, limit_total=150):
    """Page through a Reddit listing endpoint.

    Follows the ``after`` pagination cursor until *limit_total* posts have
    been collected, the listing runs dry, or Reddit stops returning a
    cursor.  Returns a list of post dicts (``kind == 't3'`` only).
    """
    collected = []
    cursor = None
    while len(collected) < limit_total:
        params = {'limit': min(100, limit_total - len(collected)), 'raw_json': 1}
        if cursor:
            params['after'] = cursor
        sep = '&' if '?' in path else '?'
        page = get_json(BASE + path + sep + urllib.parse.urlencode(params))
        kids = page['data'].get('children', [])
        if not kids:
            break
        collected += [k['data'] for k in kids if k.get('kind') == 't3']
        cursor = page['data'].get('after')
        if not cursor:
            break
        # Be polite between pages to avoid tripping rate limits.
        time.sleep(.35)
    return collected

posts=[]
# Build the work list of (listing path, post cap) pairs to harvest:
# front-page sorts, "top" over several time windows, and keyword searches.
paths = [(f'/r/{SUB}/{sort}.json', 100) for sort in ('hot', 'new')]
paths += [(f'/r/{SUB}/top.json?t={t}', 100) for t in ('week', 'month', 'year', 'all')]
queries=[
    'what to bring', 'packing list', 'essentials', 'must bring', 'first time',
    'locker', 'hydration pack', 'camelbak', 'shoes', 'earplugs', 'portable charger',
    'bag policy', 'allowed items', 'prohibited', 'totem', 'kandi', 'gum', 'vicks',
    'weather', 'cold', 'hot', 'shuttle', 'camping', 'security', 'tips'
]
for q in queries:
    enc = urllib.parse.quote(q)
    # Each query is searched twice: by relevance and by comment count.
    paths.append((f'/r/{SUB}/search.json?q={enc}&restrict_sr=on&sort=relevance&t=year', 50))
    paths.append((f'/r/{SUB}/search.json?q={enc}&restrict_sr=on&sort=comments&t=year', 50))

# Harvest every configured listing; tag each post with the path it came
# from so the report can trace provenance.  Failures are logged, not fatal.
for src, cap in paths:
    try:
        batch = listing(src, cap)
        for post in batch:
            post['_source_query'] = src
        posts.extend(batch)
    except Exception as err:
        print('ERR', src, type(err).__name__, err)

# De-duplicate by post id, keeping the first occurrence (dicts preserve
# insertion order, so ordering matches the original seen-set approach).
dedup = {}
for post in posts:
    dedup.setdefault(post['id'], post)
posts = list(dedup.values())

# sample comments for relevant/high-engagement posts
keywords=re.compile(r'bring|packing|pack|essentials|first time|tip|allowed|prohibited|hydration|camelbak|shoes|earplug|charger|locker|bag|security|weather|cold|hot|shuttle|camp|kandi|gum|vicks|totem', re.I)

def _collect_comments(children, acc):
    """Recursively append visible comment bodies (kind 't1') to *acc*.

    Skips deleted/removed comments.  Hoisted to module level: the original
    redefined this closure on every iteration of the fetch loop below.
    """
    for c in children:
        if c.get('kind') != 't1':
            continue
        d = c['data']
        body = d.get('body', '')
        if body and body not in ('[deleted]', '[removed]'):
            acc.append({'body': body, 'score': d.get('score', 0)})
        replies = d.get('replies')
        # Reddit uses '' (not a dict) when a comment has no replies.
        if isinstance(replies, dict):
            _collect_comments(replies['data'].get('children', []), acc)

# Score posts by keyword relevance + engagement + selftext length, then
# fetch comment trees for the top 80.
scored = []
for p in posts:
    text = (p.get('title', '') + ' ' + (p.get('selftext') or ''))
    rel = 10 if keywords.search(text) else 0
    scored.append((rel + p.get('num_comments', 0) * 2 + p.get('score', 0) / 20
                   + min(len(p.get('selftext') or ''), 800) / 100, p))
sample = [p for _, p in sorted(scored, key=lambda x: x[0], reverse=True)[:80]]

comments_by_id = {}
for p in sample:
    try:
        url = BASE + p['permalink'].rstrip('/') + '.json?limit=120&sort=top&raw_json=1'
        data = get_json(url)
        comm = []
        # Element [1] of the permalink payload holds the comment listing.
        if len(data) > 1:
            _collect_comments(data[1]['data'].get('children', []), comm)
        comments_by_id[p['id']] = comm[:120]
        time.sleep(.3)
    except Exception:
        # Best-effort: a failed comment fetch just leaves the post commentless.
        comments_by_id[p['id']] = []

# Item-category patterns: each category is counted once per post whose
# title/selftext/top-comments mention it.
patterns={
 'hydration pack / water bottle': r'\b(hydration|camelbak|hydropack|water pack|water bottle|refill|bladder)\b',
 'comfortable shoes / insoles': r'\b(shoes|sneakers|boots|insoles|feet|foot|blister|comfortable)\b',
 'earplugs': r'\b(earplugs?|ear plugs?|ear protection|earpeace|loops?|eargasm|hearing)\b',
 'portable charger / battery': r'\b(portable charger|power bank|battery pack|charger|charging)\b',
 'ID / wristband / ticket': r'\b(id|wristband|ticket|pass|passport|license)\b',
 'phone / case / tether': r'\b(phone|iphone|android|case|lanyard|tether|anti[- ]?theft|clutchloop)\b',
 'locker': r'\b(locker|lockers)\b',
 'jacket / hoodie / layers': r'\b(jacket|hoodie|sweater|layers?|cold|windbreaker|pashmina)\b',
 'sunscreen / sun protection': r'\b(sunscreen|sunblock|spf|sunglasses|hat|fan|cooling towel)\b',
 'wet wipes / tissues / sanitizer': r'\b(wet wipes|baby wipes|wipes|tissues|toilet paper|tp|sanitizer)\b',
 'gum / mints': r'\b(gum|mints?|hi[- ]?chew|candy)\b',
 'chapstick / lip balm': r'\b(chapstick|lip balm|aquaphor|vaseline)\b',
 'meds / first aid': r'\b(meds|medicine|ibuprofen|advil|tylenol|allergy|band[- ]?aid|bandaid|blister|moleskin|narcan)\b',
 'clear bag / fanny pack': r'\b(clear bag|bag policy|fanny pack|sling|backpack|purse|camelbak|hydration pack)\b',
 'kandi / trinkets': r'\b(kandi|trinkets?|sprouts?|stickers?|trades?|bracelets?)\b',
 'totem / meetup marker': r'\b(totem|flag|meetup|marker|find friends|group)\b',
 'cash / cards': r'\b(cash|card|credit|debit|apple pay|wallet)\b',
 'mask / bandana / dust': r'\b(mask|bandana|dust|face cover|pashmina)\b',
 'deodorant / personal care': r'\b(deodorant|perfume|cologne|hair tie|tampons?|pads?)\b',
 'snacks': r'\b(snacks?|protein bar|granola|food)\b',
}
# Compile once: the loop below runs every pattern against every post, so
# the per-iteration re-cache lookup of the original was wasted work.
compiled_patterns = {name: re.compile(pat, re.I) for name, pat in patterns.items()}

cat = Counter()
evidence = defaultdict(list)
item_mentions = Counter()  # NOTE(review): populated nowhere in this file; kept for compatibility
for p in posts:
    # Searchable text: title + selftext + the first 50 sampled comments.
    text = '\n'.join([p.get('title', ''), p.get('selftext') or '']
                     + [c['body'] for c in comments_by_id.get(p['id'], [])[:50]])
    low = text.lower()
    for name, rx in compiled_patterns.items():
        if rx.search(low):
            cat[name] += 1
            # Keep at most 6 evidence posts per category.
            if len(evidence[name]) < 6:
                # find snippet: prefer selftext, fall back to title, then first comment
                snip = (p.get('selftext') or p.get('title') or '')[:500]
                if not snip and comments_by_id.get(p['id']):
                    snip = comments_by_id[p['id']][0]['body'][:500]
                evidence[name].append({'title': p.get('title', ''),
                                       'score': p.get('score'),
                                       'comments': p.get('num_comments'),
                                       'url': 'https://reddit.com' + p['permalink'],
                                       'snippet': snip})

# relevant top posts: keep any post whose title or selftext hits the keyword regex
relevant = [
    p for p in posts
    if keywords.search(p.get('title', '') + ' ' + (p.get('selftext') or ''))
]

# Assemble the final report structure.
report={
 'subreddit': SUB,
 'post_count': len(posts),
 'comment_posts_sampled': len(comments_by_id),
 'category_counts': cat.most_common(),
 'evidence': evidence,
 'top_relevant_posts': [
     {'title': p['title'], 'score': p.get('score'), 'comments': p.get('num_comments'),
      'url': 'https://reddit.com' + p['permalink'],
      'selftext': (p.get('selftext') or '')[:900]}
     for p in sorted(relevant,
                     key=lambda x: (x.get('num_comments', 0) * 3 + x.get('score', 0)),
                     reverse=True)[:40]],
 'high_comment_snippets': []
}
# collect useful high score comments mentioning bring tips
# One dict instead of the original O(posts) linear scan per sampled post.
posts_by_id = {p['id']: p for p in posts}
for pid, comms in comments_by_id.items():
    post = posts_by_id.get(pid)
    for c in sorted(comms, key=lambda x: x['score'], reverse=True)[:10]:
        if keywords.search(c['body']):
            report['high_comment_snippets'].append({
                'post_title': post['title'] if post else pid,
                'score': c['score'],
                'body': c['body'][:800],
                # parenthesized so the conditional clearly governs the whole URL
                'url': ('https://reddit.com' + post['permalink']) if post else ''})
report['high_comment_snippets'] = sorted(report['high_comment_snippets'],
                                         key=lambda x: x['score'], reverse=True)[:80]

path='/Users/cynthia/.hermes/hermes-agent/edc_festival_reddit_report.json'
# Context manager + explicit encoding: the original open(...).write(...)
# never closed the handle and relied on the platform default encoding.
with open(path, 'w', encoding='utf-8') as fh:
    fh.write(json.dumps(report, indent=2))
print(path)
print('posts', len(posts), 'comment posts', len(comments_by_id))
for k, v in cat.most_common(30):
    print(f'{k}: {v} ({v/len(posts):.1%})')
print('\nTop relevant:')
for p in report['top_relevant_posts'][:10]:
    print('-', p['score'], p['comments'], p['title'][:100])
